def test_speaker_embedding():
    # load config
    config = load_config(encoder_config_path)
    config.audio.resample = True

    # create a dummy speaker encoder
    model = setup_speaker_encoder_model(config)
    save_checkpoint(model, None, None, get_tests_input_path(), 0)

    # load audio processor and speaker encoder
    ap = AudioProcessor(**config.audio)
    manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)

    # load a sample audio and compute embedding
    waveform = ap.load_wav(sample_wav_path)
    mel = ap.melspectrogram(waveform)
    d_vector = manager.compute_d_vector(mel)
    assert d_vector.shape[1] == 256

    # compute d_vector directly from an input file
    d_vector = manager.compute_d_vector_from_clip(sample_wav_path)
    d_vector2 = manager.compute_d_vector_from_clip(sample_wav_path)
    d_vector = torch.FloatTensor(d_vector)
    d_vector2 = torch.FloatTensor(d_vector2)
    assert d_vector.shape[0] == 256
    assert (d_vector - d_vector2).sum() == 0.0

    # compute d_vector from a list of wav files
    d_vector3 = manager.compute_d_vector_from_clip([sample_wav_path, sample_wav_path2])
    d_vector3 = torch.FloatTensor(d_vector3)
    assert d_vector3.shape[0] == 256
    assert (d_vector - d_vector3).sum() != 0.0

    # remove the dummy model
    os.remove(encoder_model_path)
def plot_results(y_hat: torch.Tensor, y: torch.Tensor, ap: AudioProcessor, name_prefix: str = None) -> Dict:
    """Plot the predicted and the real waveforms and their spectrograms.

    Args:
        y_hat (torch.Tensor): Predicted waveform.
        y (torch.Tensor): Real waveform.
        ap (AudioProcessor): Audio processor used to process the waveforms.
        name_prefix (str, optional): Name prefix used to name the figures. Defaults to None.

    Returns:
        Dict: output figures keyed by the name of the figures.
    """
    if name_prefix is None:
        name_prefix = ""

    # select an instance from batch
    y_hat = y_hat[0].squeeze().detach().cpu().numpy()
    y = y[0].squeeze().detach().cpu().numpy()

    spec_fake = ap.melspectrogram(y_hat).T
    spec_real = ap.melspectrogram(y).T
    spec_diff = np.abs(spec_fake - spec_real)

    # plot figure and save it
    fig_wave = plt.figure()
    plt.subplot(2, 1, 1)
    plt.plot(y)
    plt.title("groundtruth speech")
    plt.subplot(2, 1, 2)
    plt.plot(y_hat)
    plt.title("generated speech")
    plt.tight_layout()
    plt.close()

    figures = {
        name_prefix + "spectrogram/fake": plot_spectrogram(spec_fake),
        name_prefix + "spectrogram/real": plot_spectrogram(spec_real),
        name_prefix + "spectrogram/diff": plot_spectrogram(spec_diff),
        name_prefix + "speech_comparison": fig_wave,
    }
    return figures
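# NOTE: a minimal usage sketch for plot_results, assuming `y_hat`/`y` are (B, 1, T)
# batch tensors and `config.audio` is defined elsewhere; `tb_writer` is a
# hypothetical torch.utils.tensorboard.SummaryWriter instance, not part of the code above.
ap = AudioProcessor(**config.audio)
figures = plot_results(y_hat, y, ap, name_prefix="eval/")
for name, fig in figures.items():
    tb_writer.add_figure(name, fig)  # log each matplotlib figure under its key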
def test_run(
    self, ap: AudioProcessor, samples: List[Dict], output: Dict  # pylint: disable=unused-argument
) -> Tuple[Dict, Dict]:
    figures = {}
    audios = {}
    for idx, sample in enumerate(samples):
        x = torch.FloatTensor(sample[0])
        x = x.to(next(self.parameters()).device)
        y_hat = self.inference(x, self.config.batched, self.config.target_samples, self.config.overlap_samples)
        x_hat = ap.melspectrogram(y_hat)
        figures.update(
            {
                f"test_{idx}/ground_truth": plot_spectrogram(x.T),
                f"test_{idx}/prediction": plot_spectrogram(x_hat.T),
            }
        )
        audios.update({f"test_{idx}/audio": y_hat})
    return figures, audios
def preprocess_wav_files(out_path: str, config: Coqpit, ap: AudioProcessor):
    """Process wav and compute mel and quantized wave signal.
    It is mainly used by WaveRNN dataloader.

    Args:
        out_path (str): Parent folder path to save the files.
        config (Coqpit): Model config.
        ap (AudioProcessor): Audio processor.
    """
    os.makedirs(os.path.join(out_path, "quant"), exist_ok=True)
    os.makedirs(os.path.join(out_path, "mel"), exist_ok=True)
    wav_files = find_wav_files(config.data_path)
    for path in tqdm(wav_files):
        wav_name = Path(path).stem
        quant_path = os.path.join(out_path, "quant", wav_name + ".npy")
        mel_path = os.path.join(out_path, "mel", wav_name + ".npy")
        y = ap.load_wav(path)
        mel = ap.melspectrogram(y)
        np.save(mel_path, mel)
        if isinstance(config.mode, int):
            quant = (
                ap.mulaw_encode(y, qc=config.mode)
                if config.model_args.mulaw
                else ap.quantize(y, bits=config.mode)
            )
            np.save(quant_path, quant)
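# NOTE: illustration only — the standard mu-law companding and linear quantization
# formulas that `mulaw_encode` / `quantize` above conceptually apply; this is a
# sketch, not necessarily AudioProcessor's exact implementation.
import numpy as np

def mulaw_encode_sketch(wav: np.ndarray, qc: int) -> np.ndarray:
    mu = 2 ** qc - 1
    # compress amplitudes logarithmically, then map [-1, 1] -> [0, mu] integer bins
    signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1 + mu)
    return np.floor((signal + 1) / 2 * mu + 0.5).astype(np.int64)

def quantize_sketch(wav: np.ndarray, bits: int) -> np.ndarray:
    # plain linear quantization of [-1, 1] to 2**bits levels
    return ((wav + 1.0) * (2 ** bits - 1) / 2).astype(np.int64)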
class TestAudio(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ap = AudioProcessor(**conf)

    def test_audio_synthesis(self):
        """1. load wav
        2. set normalization parameters
        3. extract mel-spec
        4. invert to wav and save the output
        """
        print(" > Sanity check for the process wav -> mel -> wav")

        def _test(max_norm, signal_norm, symmetric_norm, clip_norm):
            self.ap.max_norm = max_norm
            self.ap.signal_norm = signal_norm
            self.ap.symmetric_norm = symmetric_norm
            self.ap.clip_norm = clip_norm
            wav = self.ap.load_wav(WAV_FILE)
            mel = self.ap.melspectrogram(wav)
            wav_ = self.ap.inv_melspectrogram(mel)
            file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav".format(
                max_norm, signal_norm, symmetric_norm, clip_norm
            )
            print(" | > Creating wav file at : ", file_name)
            self.ap.save_wav(wav_, OUT_PATH + file_name)

        # maxnorm = 1.0
        _test(1.0, False, False, False)
        _test(1.0, True, False, False)
        _test(1.0, True, True, False)
        _test(1.0, True, False, True)
        _test(1.0, True, True, True)
        # maxnorm = 4.0
        _test(4.0, False, False, False)
        _test(4.0, True, False, False)
        _test(4.0, True, True, False)
        _test(4.0, True, False, True)
        _test(4.0, True, True, True)

    def test_normalize(self):
        """Check normalization and denormalization for range values and consistency"""
        print(" > Testing normalization and denormalization.")
        wav = self.ap.load_wav(WAV_FILE)
        wav = self.ap.sound_norm(wav)  # normalize audio to get a better normalization range below
        self.ap.signal_norm = False
        x = self.ap.melspectrogram(wav)
        x_old = x

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.clip_norm = False
        self.ap.max_norm = 4.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
        assert x_norm.min() >= 0 - 1, x_norm.min()
        # check denorm
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.clip_norm = True
        self.ap.max_norm = 4.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= 0, x_norm.min()
        # check denorm
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.clip_norm = False
        self.ap.max_norm = 4.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min()  # pylint: disable=invalid-unary-operand-type
        assert x_norm.min() <= 0, x_norm.min()
        # check denorm
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.clip_norm = True
        self.ap.max_norm = 4.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()  # pylint: disable=invalid-unary-operand-type
        assert x_norm.min() <= 0, x_norm.min()
        # check denorm
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.max_norm = 1.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )
        assert (x_old - x).sum() == 0
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= 0, x_norm.min()
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.max_norm = 1.0
        x_norm = self.ap.normalize(x)
        print(
            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} --  {x_norm.min()}"
        )
        assert (x_old - x).sum() == 0
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()  # pylint: disable=invalid-unary-operand-type
        assert x_norm.min() < 0, x_norm.min()
        x_ = self.ap.denormalize(x_norm)
        assert (x - x_).sum() < 1e-3

    def test_scaler(self):
        scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy")
        conf.stats_path = scaler_stats_path
        conf.preemphasis = 0.0
        conf.do_trim_silence = True
        conf.signal_norm = True

        ap = AudioProcessor(**conf)
        mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path)
        ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std)

        self.ap.signal_norm = False
        self.ap.preemphasis = 0.0

        # test scaler forward and backward transforms
        wav = self.ap.load_wav(WAV_FILE)
        mel_reference = self.ap.melspectrogram(wav)
        mel_norm = ap.melspectrogram(wav)
        mel_denorm = ap.denormalize(mel_norm)
        assert abs(mel_reference - mel_denorm).max() < 1e-4

    def test_compute_f0(self):  # pylint: disable=no-self-use
        ap = AudioProcessor(**conf)
        wav = ap.load_wav(WAV_FILE)
        pitch = ap.compute_f0(wav)
        mel = ap.melspectrogram(wav)
        assert pitch.shape[0] == mel.shape[1]
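# NOTE: a paraphrased sketch of the range logic the asserts above exercise, not
# guaranteed to match AudioProcessor line-for-line: the dB spectrogram is mapped
# to roughly [0, max_norm] (asymmetric) or [-max_norm, max_norm] (symmetric),
# and stays strictly inside those bounds only when clip_norm is enabled.
import numpy as np

def normalize_sketch(S_db, min_level_db=-100, max_norm=4.0, symmetric=True, clip=False):
    S_norm = (S_db - min_level_db) / -min_level_db      # roughly [0, 1]
    if symmetric:
        S_norm = 2 * max_norm * S_norm - max_norm       # roughly [-max_norm, max_norm]
        return np.clip(S_norm, -max_norm, max_norm) if clip else S_norm
    S_norm = max_norm * S_norm                          # roughly [0, max_norm]
    return np.clip(S_norm, 0, max_norm) if clip else S_norm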
class TestTTSDataset(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_loader_iter = 4
        self.ap = AudioProcessor(**c.audio)

    def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):
        # load dataset
        meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2)
        items = meta_data_train + meta_data_eval
        tokenizer, _ = TTSTokenizer.init_from_config(c)
        dataset = TTSDataset(
            outputs_per_step=r,
            compute_linear_spec=True,
            return_wav=True,
            tokenizer=tokenizer,
            ap=self.ap,
            samples=items,
            batch_group_size=bgs,
            min_text_len=c.min_text_len,
            max_text_len=c.max_text_len,
            min_audio_len=c.min_audio_len,
            max_audio_len=c.max_audio_len,
            start_by_longest=start_by_longest,
        )
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=dataset.collate_fn,
            drop_last=True,
            num_workers=c.num_loader_workers,
        )
        return dataloader, dataset

    def test_loader(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(1, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data["token_id"]
                _ = data["token_id_lengths"]
                speaker_name = data["speaker_names"]
                linear_input = data["linear"]
                mel_input = data["mel"]
                mel_lengths = data["mel_lengths"]
                _ = data["stop_targets"]
                _ = data["item_idxs"]
                wavs = data["waveform"]

                neg_values = text_input[text_input < 0]
                check_count = len(neg_values)

                # check basic conditions
                self.assertEqual(check_count, 0)
                self.assertEqual(linear_input.shape[0], mel_input.shape[0], c.batch_size)
                self.assertEqual(linear_input.shape[2], self.ap.fft_size // 2 + 1)
                self.assertEqual(mel_input.shape[2], c.audio["num_mels"])
                self.assertEqual(wavs.shape[1], mel_input.shape[1] * c.audio.hop_length)
                self.assertIsInstance(speaker_name[0], str)

                # make sure that the computed mels and the waveform match and are correctly computed
                mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy())
                # remove padding in the mel-spectrogram
                mel_dataloader = mel_input[0].T.numpy()[:, :mel_lengths[0]]
                # guarantee that both mel-spectrograms have the same size and remove waveform padding
                mel_new = mel_new[:, :mel_lengths[0]]
                ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
                mel_diff = (mel_new[:, :mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg]
                self.assertLess(abs(mel_diff.sum()), 1e-5)

                # check normalization ranges
                if self.ap.symmetric_norm:
                    self.assertLessEqual(mel_input.max(), self.ap.max_norm)
                    self.assertGreaterEqual(
                        mel_input.min(), -self.ap.max_norm  # pylint: disable=invalid-unary-operand-type
                    )
                    self.assertLess(mel_input.min(), 0)
                else:
                    self.assertLessEqual(mel_input.max(), self.ap.max_norm)
                    self.assertGreaterEqual(mel_input.min(), 0)

    def test_batch_group_shuffle(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(2, c.r, 16)
            last_length = 0
            frames = dataset.samples
            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                mel_lengths = data["mel_lengths"]
                avg_length = mel_lengths.numpy().mean()
            dataloader.dataset.preprocess_samples()
            is_items_reordered = False
            for idx, item in enumerate(dataloader.dataset.samples):
                if item != frames[idx]:
                    is_items_reordered = True
                    break
            self.assertGreaterEqual(avg_length, last_length)
            self.assertTrue(is_items_reordered)

    def test_start_by_longest(self):
        """Test the `start_by_longest` option.

        The first item of the first batch must be longer than all the other items.
        """
        if ok_ljspeech:
            dataloader, _ = self._create_dataloader(2, c.r, 0, True)
            dataloader.dataset.preprocess_samples()
            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                mel_lengths = data["mel_lengths"]
                if i == 0:
                    max_len = mel_lengths[0]
                print(mel_lengths)
                self.assertTrue(all(max_len >= mel_lengths))

    def test_padding_and_spectrograms(self):
        def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths):
            self.assertNotEqual(linear_input[idx, -1].sum(), 0)  # check padding
            self.assertNotEqual(linear_input[idx, -2].sum(), 0)
            self.assertNotEqual(mel_input[idx, -1].sum(), 0)
            self.assertNotEqual(mel_input[idx, -2].sum(), 0)
            self.assertEqual(stop_target[idx, -1], 1)
            self.assertEqual(stop_target[idx, -2], 0)
            self.assertEqual(stop_target[idx].sum(), 1)
            self.assertEqual(len(mel_lengths.shape), 1)
            self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0])
            self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0])

        if ok_ljspeech:
            dataloader, _ = self._create_dataloader(1, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                linear_input = data["linear"]
                mel_input = data["mel"]
                mel_lengths = data["mel_lengths"]
                stop_target = data["stop_targets"]
                item_idx = data["item_idxs"]

                # check mel_spec consistency
                wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32)
                mel = self.ap.melspectrogram(wav).astype("float32")
                mel = torch.FloatTensor(mel).contiguous()
                mel_dl = mel_input[0]
                # NOTE: Below needs to check == 0 but due to an unknown reason
                # there is a slight difference between the two matrices.
                # TODO: Check this assert cond more in detail.
                self.assertLess(abs(mel.T - mel_dl).max(), 1e-5)

                # check mel-spec correctness
                mel_spec = mel_input[0].cpu().numpy()
                wav = self.ap.inv_melspectrogram(mel_spec.T)
                self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav")
                shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav")

                # check linear-spec
                linear_spec = linear_input[0].cpu().numpy()
                wav = self.ap.inv_spectrogram(linear_spec.T)
                self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav")
                shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav")

                # check the outputs
                check_conditions(0, linear_input, mel_input, stop_target, mel_lengths)

            # Test for batch size 2
            dataloader, _ = self._create_dataloader(2, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                linear_input = data["linear"]
                mel_input = data["mel"]
                mel_lengths = data["mel_lengths"]
                stop_target = data["stop_targets"]
                item_idx = data["item_idxs"]

                # set id to the longest sequence in the batch
                if mel_lengths[0] > mel_lengths[1]:
                    idx = 0
                else:
                    idx = 1

                # check the longer item in the batch
                check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths)

                # check the other item in the batch
                self.assertEqual(linear_input[1 - idx, -1].sum(), 0)
                self.assertEqual(mel_input[1 - idx, -1].sum(), 0)
                self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1)
                self.assertEqual(stop_target[1, mel_lengths[1]:].sum(), stop_target.shape[1] - mel_lengths[1])
                self.assertEqual(len(mel_lengths.shape), 1)
class TWEBDataset(Dataset):
    def __init__(self,
                 csv_file,
                 root_dir,
                 outputs_per_step,
                 sample_rate,
                 text_cleaner,
                 num_mels,
                 min_level_db,
                 frame_shift_ms,
                 frame_length_ms,
                 preemphasis,
                 ref_level_db,
                 num_freq,
                 power,
                 min_seq_len=0):
        with open(csv_file, "r") as f:
            self.frames = [line.split('\t') for line in f]
        self.root_dir = root_dir
        self.outputs_per_step = outputs_per_step
        self.sample_rate = sample_rate
        self.cleaners = text_cleaner
        self.min_seq_len = min_seq_len
        self.ap = AudioProcessor(sample_rate, num_mels, min_level_db,
                                 frame_shift_ms, frame_length_ms, preemphasis,
                                 ref_level_db, num_freq, power)
        print(" > Reading TWEB from - {}".format(root_dir))
        print(" | > Number of instances : {}".format(len(self.frames)))
        self._sort_frames()

    def load_wav(self, filename):
        try:
            audio = librosa.core.load(filename, sr=self.sample_rate)
            return audio
        except RuntimeError:
            print(" !! Cannot read file : {}".format(filename))

    def _sort_frames(self):
        r"""Sort sequences in ascending order"""
        lengths = np.array([len(ins[1]) for ins in self.frames])
        print(" | > Max length sequence {}".format(np.max(lengths)))
        print(" | > Min length sequence {}".format(np.min(lengths)))
        print(" | > Avg length sequence {}".format(np.mean(lengths)))
        idxs = np.argsort(lengths)
        new_frames = []
        ignored = []
        for i, idx in enumerate(idxs):
            length = lengths[idx]
            if length < self.min_seq_len:
                ignored.append(idx)
            else:
                new_frames.append(self.frames[idx])
        print(" | > {} instances are ignored by min_seq_len ({})".format(
            len(ignored), self.min_seq_len))
        self.frames = new_frames

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        wav_name = os.path.join(self.root_dir, self.frames[idx][0]) + '.wav'
        text = self.frames[idx][1]
        text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32)
        wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
        sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
        return sample

    def collate_fn(self, batch):
        r"""
        Perform preprocessing and create a final data batch:
        1. PAD sequences with the longest sequence in the batch
        2. Convert Audio signal to Spectrograms.
        3. PAD sequences that can be divided by r.
        4. Convert Numpy to Torch tensors.
        """
        # Puts each data field into a tensor with outer dimension batch size
        if isinstance(batch[0], collections.abc.Mapping):
            wav = [d['wav'] for d in batch]
            item_idxs = [d['item_idx'] for d in batch]
            text = [d['text'] for d in batch]

            text_lengths = np.array([len(x) for x in text])
            max_text_len = np.max(text_lengths)

            linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
            mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
            mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame

            # compute 'stop token' targets
            stop_targets = [np.array([0.] * (mel_len - 1)) for mel_len in mel_lengths]

            # PAD stop targets
            stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step)

            # PAD sequences with the largest length of the batch
            text = prepare_data(text).astype(np.int32)
            wav = prepare_data(wav)

            # PAD features with the largest length + a zero frame
            linear = prepare_tensor(linear, self.outputs_per_step)
            mel = prepare_tensor(mel, self.outputs_per_step)
            assert mel.shape[2] == linear.shape[2]
            timesteps = mel.shape[2]

            # B x T x D
            linear = linear.transpose(0, 2, 1)
            mel = mel.transpose(0, 2, 1)

            # convert things to pytorch
            text_lengths = torch.LongTensor(text_lengths)
            text = torch.LongTensor(text)
            linear = torch.FloatTensor(linear)
            mel = torch.FloatTensor(mel)
            mel_lengths = torch.LongTensor(mel_lengths)
            stop_targets = torch.FloatTensor(stop_targets)

            return text, text_lengths, linear, mel, mel_lengths, stop_targets, item_idxs[0]

        raise TypeError(("batch must contain tensors, numbers, dicts or lists;"
                         " found {}".format(type(batch[0]))))
class TestTTSDataset(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_loader_iter = 4
        self.ap = AudioProcessor(**c.audio)

    def _create_dataloader(self, batch_size, r, bgs):
        items = ljspeech(c.data_path, "metadata.csv")
        dataset = TTSDataset.MyDataset(
            r,
            c.text_cleaner,
            compute_linear_spec=True,
            ap=self.ap,
            meta_data=items,
            tp=c.characters,
            batch_group_size=bgs,
            min_seq_len=c.min_seq_len,
            max_seq_len=float("inf"),
            use_phonemes=False,
        )
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=dataset.collate_fn,
            drop_last=True,
            num_workers=c.num_loader_workers,
        )
        return dataloader, dataset

    def test_loader(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(2, c.r, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                speaker_name = data[2]
                linear_input = data[3]
                mel_input = data[4]
                mel_lengths = data[5]
                stop_target = data[6]
                item_idx = data[7]

                neg_values = text_input[text_input < 0]
                check_count = len(neg_values)
                assert check_count == 0, " !! Negative values in text_input: {}".format(check_count)
                # TODO: more assertions here
                assert isinstance(speaker_name[0], str)
                assert linear_input.shape[0] == c.batch_size
                assert linear_input.shape[2] == self.ap.fft_size // 2 + 1
                assert mel_input.shape[0] == c.batch_size
                assert mel_input.shape[2] == c.audio["num_mels"]
                # check normalization ranges
                if self.ap.symmetric_norm:
                    assert mel_input.max() <= self.ap.max_norm
                    assert mel_input.min() >= -self.ap.max_norm  # pylint: disable=invalid-unary-operand-type
                    assert mel_input.min() < 0
                else:
                    assert mel_input.max() <= self.ap.max_norm
                    assert mel_input.min() >= 0

    def test_batch_group_shuffle(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(2, c.r, 16)
            last_length = 0
            frames = dataset.items
            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                speaker_name = data[2]
                linear_input = data[3]
                mel_input = data[4]
                mel_lengths = data[5]
                stop_target = data[6]
                item_idx = data[7]
                avg_length = mel_lengths.numpy().mean()
                assert avg_length >= last_length
            dataloader.dataset.sort_items()
            is_items_reordered = False
            for idx, item in enumerate(dataloader.dataset.items):
                if item != frames[idx]:
                    is_items_reordered = True
                    break
            assert is_items_reordered

    def test_padding_and_spec(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(1, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                speaker_name = data[2]
                linear_input = data[3]
                mel_input = data[4]
                mel_lengths = data[5]
                stop_target = data[6]
                item_idx = data[7]

                # check mel_spec consistency
                wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32)
                mel = self.ap.melspectrogram(wav).astype("float32")
                mel = torch.FloatTensor(mel).contiguous()
                mel_dl = mel_input[0]
                # NOTE: Below needs to check == 0 but due to an unknown reason
                # there is a slight difference between the two matrices.
                # TODO: Check this assert cond more in detail.
                assert abs(mel.T - mel_dl).max() < 1e-5, abs(mel.T - mel_dl).max()

                # check mel-spec correctness
                mel_spec = mel_input[0].cpu().numpy()
                wav = self.ap.inv_melspectrogram(mel_spec.T)
                self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav")
                shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav")

                # check linear-spec
                linear_spec = linear_input[0].cpu().numpy()
                wav = self.ap.inv_spectrogram(linear_spec.T)
                self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav")
                shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav")

                # check the last time step to be zero padded
                assert linear_input[0, -1].sum() != 0
                assert linear_input[0, -2].sum() != 0
                assert mel_input[0, -1].sum() != 0
                assert mel_input[0, -2].sum() != 0
                assert stop_target[0, -1] == 1
                assert stop_target[0, -2] == 0
                assert stop_target.sum() == 1
                assert len(mel_lengths.shape) == 1
                assert mel_lengths[0] == linear_input[0].shape[0]
                assert mel_lengths[0] == mel_input[0].shape[0]

            # Test for batch size 2
            dataloader, dataset = self._create_dataloader(2, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                speaker_name = data[2]
                linear_input = data[3]
                mel_input = data[4]
                mel_lengths = data[5]
                stop_target = data[6]
                item_idx = data[7]

                if mel_lengths[0] > mel_lengths[1]:
                    idx = 0
                else:
                    idx = 1

                # check the longer item in the batch
                assert linear_input[idx, -1].sum() != 0
                assert linear_input[idx, -2].sum() != 0, linear_input
                assert mel_input[idx, -1].sum() != 0
                assert mel_input[idx, -2].sum() != 0, mel_input
                assert stop_target[idx, -1] == 1
                assert stop_target[idx, -2] == 0
                assert stop_target[idx].sum() == 1
                assert len(mel_lengths.shape) == 1
                assert mel_lengths[idx] == mel_input[idx].shape[0]
                assert mel_lengths[idx] == linear_input[idx].shape[0]

                # check the shorter item in the batch
                assert linear_input[1 - idx, -1].sum() == 0
                assert mel_input[1 - idx, -1].sum() == 0
                assert stop_target[1, mel_lengths[1] - 1] == 1
                assert stop_target[1, mel_lengths[1]:].sum() == 0
                assert len(mel_lengths.shape) == 1
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of spectrogram features.")
    parser.add_argument("--config_path", type=str, required=True,
                        help="TTS config file path.")
    parser.add_argument("--out_path", default=None, type=str,
                        help="directory to save the output file.")
    args = parser.parse_args()

    # load config
    CONFIG = load_config(args.config_path)
    CONFIG.audio['signal_norm'] = False  # do not apply earlier normalization
    CONFIG.audio['stats_path'] = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio)

    # load the meta data of target dataset
    dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # compute features
        wav = ap.load_wav(item[1])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel ** 2).sum(axis=1)
        linear_square_sum += (linear ** 2).sum(axis=1)

    mel_mean = mel_sum / N
    mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2)
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2)

    output_file_path = os.path.join(args.out_path, "scale_stats.npy")
    stats = {}
    stats['mel_mean'] = mel_mean
    stats['mel_std'] = mel_scale
    stats['linear_mean'] = linear_mean
    stats['linear_std'] = linear_scale

    print(f' > Avg mel spec mean: {mel_mean.mean()}')
    print(f' > Avg mel spec scale: {mel_scale.mean()}')
    print(f' > Avg linear spec mean: {linear_mean.mean()}')
    print(f' > Avg linear spec scale: {linear_scale.mean()}')

    # set default config values for mean-var scaling
    CONFIG.audio['stats_path'] = output_file_path
    CONFIG.audio['signal_norm'] = True
    # remove redundant values
    del CONFIG.audio['max_norm']
    del CONFIG.audio['min_level_db']
    del CONFIG.audio['symmetric_norm']
    del CONFIG.audio['clip_norm']
    stats['audio_config'] = CONFIG.audio
    np.save(output_file_path, stats, allow_pickle=True)
    print(f' > scale_stats.npy is saved to {output_file_path}')
# ### Check audio loading

# In[ ]:

wav = AP.load_wav(file_paths[10])
ipd.Audio(data=wav, rate=AP.sample_rate)

# ### Generate Mel-Spectrogram and Re-synthesis with GL

# In[ ]:

mel = AP.melspectrogram(wav)
print("Max:", mel.max())
print("Min:", mel.min())
print("Mean:", mel.mean())
plot_spectrogram(mel.T, AP);

wav_gen = AP.inv_mel_spectrogram(mel)
ipd.Audio(wav_gen, rate=AP.sample_rate)

# ### Generate Linear-Spectrogram and Re-synthesis with GL

# In[ ]:

spec = AP.spectrogram(wav)
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogram features.")
    parser.add_argument("config_path", type=str, help="TTS config file path to define audio processing parameters.")
    parser.add_argument("out_path", type=str, help="save path (directory and filename).")
    parser.add_argument(
        "--data_path",
        type=str,
        required=False,
        help="folder including the target set of wavs overriding dataset config.",
    )
    args, overrides = parser.parse_known_args()

    # load config
    CONFIG = load_config(args.config_path)
    CONFIG.parse_known_args(overrides, relaxed_parser=True)
    CONFIG.audio.signal_norm = False  # do not apply earlier normalization
    CONFIG.audio.stats_path = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio.to_dict())

    # load the meta data of target dataset
    if args.data_path:
        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
    else:
        dataset_items = load_meta_data(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # compute features
        wav = ap.load_wav(item if isinstance(item, str) else item[1])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel ** 2).sum(axis=1)
        linear_square_sum += (linear ** 2).sum(axis=1)

    mel_mean = mel_sum / N
    mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2)
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2)

    output_file_path = args.out_path
    stats = {}
    stats["mel_mean"] = mel_mean
    stats["mel_std"] = mel_scale
    stats["linear_mean"] = linear_mean
    stats["linear_std"] = linear_scale

    print(f" > Avg mel spec mean: {mel_mean.mean()}")
    print(f" > Avg mel spec scale: {mel_scale.mean()}")
    print(f" > Avg linear spec mean: {linear_mean.mean()}")
    print(f" > Avg linear spec scale: {linear_scale.mean()}")

    # set default config values for mean-var scaling
    CONFIG.audio.stats_path = output_file_path
    CONFIG.audio.signal_norm = True
    # remove redundant values
    del CONFIG.audio.max_norm
    del CONFIG.audio.min_level_db
    del CONFIG.audio.symmetric_norm
    del CONFIG.audio.clip_norm
    stats["audio_config"] = CONFIG.audio.to_dict()
    np.save(output_file_path, stats, allow_pickle=True)
    print(f" > stats saved to {output_file_path}")
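# NOTE: a minimal sketch of how the saved stats are typically consumed for
# mean-var normalization; "sample.wav" is a placeholder path and the
# broadcasting assumes a mel shaped (num_mels, T) as produced above.
import numpy as np

stats = np.load("scale_stats.npy", allow_pickle=True).item()
mel_mean, mel_std = stats["mel_mean"], stats["mel_std"]
mel = ap.melspectrogram(ap.load_wav("sample.wav"))      # ap from the script above
mel_norm = (mel - mel_mean[:, None]) / (mel_std[:, None] + 1e-8)  # per-bin scaling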
class TestAudio(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(TestAudio, self).__init__(*args, **kwargs)
        self.ap = AudioProcessor(**conf.audio)

    def test_audio_synthesis(self):
        """
        1. load wav
        2. set normalization parameters
        3. extract mel-spec
        4. invert to wav and save the output
        """
        print(" > Sanity check for the process wav -> mel -> wav")

        def _test(max_norm, signal_norm, symmetric_norm, clip_norm):
            self.ap.max_norm = max_norm
            self.ap.signal_norm = signal_norm
            self.ap.symmetric_norm = symmetric_norm
            self.ap.clip_norm = clip_norm
            wav = self.ap.load_wav(WAV_FILE)
            mel = self.ap.melspectrogram(wav)
            wav_ = self.ap.inv_mel_spectrogram(mel)
            file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav".format(
                max_norm, signal_norm, symmetric_norm, clip_norm)
            print(" | > Creating wav file at : ", file_name)
            self.ap.save_wav(wav_, OUT_PATH + file_name)

        # maxnorm = 1.0
        _test(1., False, False, False)
        _test(1., True, False, False)
        _test(1., True, True, False)
        _test(1., True, False, True)
        _test(1., True, True, True)
        # maxnorm = 4.0
        _test(4., False, False, False)
        _test(4., True, False, False)
        _test(4., True, True, False)
        _test(4., True, False, True)
        _test(4., True, True, True)

    def test_normalize(self):
        """Check normalization and denormalization for range values and consistency"""
        print(" > Testing normalization and denormalization.")
        wav = self.ap.load_wav(WAV_FILE)
        self.ap.signal_norm = False
        x = self.ap.melspectrogram(wav)
        x_old = x

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.clip_norm = False
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
        assert x_norm.min() >= 0 - 1, x_norm.min()
        # check denorm
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.clip_norm = True
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= 0, x_norm.min()
        # check denorm
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.clip_norm = False
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min()
        assert x_norm.min() <= 0, x_norm.min()
        # check denorm
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.clip_norm = True
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()
        assert x_norm.min() <= 0, x_norm.min()
        # check denorm
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.max_norm = 1.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= 0, x_norm.min()
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.max_norm = 1.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()
        assert x_norm.min() < 0, x_norm.min()
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3
class SpeakerManager:
    """Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information
    in a way that can be queried by speaker or clip.

    There are 3 different scenarios considered:

    1. Models using speaker embedding layers. The datafile only maps speaker names to ids used by the embedding layer.
    2. Models using d-vectors. The datafile includes a dictionary in the following format.

    ::

        {
            'clip_name.wav':{
                'name': 'speakerA',
                'embedding': [<d_vector_values>]
            },
            ...
        }

    3. Computing the d-vectors by the speaker encoder. It loads the speaker encoder model and
    computes the d-vectors for a given clip or speaker.

    Args:
        d_vectors_file_path (str, optional): Path to the metafile including d-vectors. Defaults to "".
        speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by
            TTS models. Defaults to "".
        encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
        encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".

    Examples:
        >>> # load audio processor and speaker encoder
        >>> ap = AudioProcessor(**config.audio)
        >>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
        >>> # load a sample audio and compute embedding
        >>> waveform = ap.load_wav(sample_wav_path)
        >>> mel = ap.melspectrogram(waveform)
        >>> d_vector = manager.compute_d_vector(mel.T)
    """

    def __init__(
        self,
        data_items: List[List[Any]] = None,
        d_vectors_file_path: str = "",
        speaker_id_file_path: str = "",
        encoder_model_path: str = "",
        encoder_config_path: str = "",
        use_cuda: bool = False,
    ):
        self.d_vectors = {}
        self.speaker_ids = {}
        self.clip_ids = []
        self.speaker_encoder = None
        self.speaker_encoder_ap = None
        self.use_cuda = use_cuda

        if data_items:
            self.speaker_ids, _ = self.parse_speakers_from_data(data_items)

        if d_vectors_file_path:
            self.set_d_vectors_from_file(d_vectors_file_path)

        if speaker_id_file_path:
            self.set_speaker_ids_from_file(speaker_id_file_path)

        if encoder_model_path and encoder_config_path:
            self.init_speaker_encoder(encoder_model_path, encoder_config_path)

    @staticmethod
    def _load_json(json_file_path: str) -> Dict:
        with fsspec.open(json_file_path, "r") as f:
            return json.load(f)

    @staticmethod
    def _save_json(json_file_path: str, data: dict) -> None:
        with fsspec.open(json_file_path, "w") as f:
            json.dump(data, f, indent=4)

    @property
    def num_speakers(self):
        return len(self.speaker_ids)

    @property
    def speaker_names(self):
        return list(self.speaker_ids.keys())

    @property
    def d_vector_dim(self):
        """Dimensionality of d_vectors. If d_vectors are not loaded, returns zero."""
        if self.d_vectors:
            return len(self.d_vectors[list(self.d_vectors.keys())[0]]["embedding"])
        return 0

    @staticmethod
    def parse_speakers_from_data(items: list) -> Tuple[Dict, int]:
        """Parse speaker IDs from data samples returned by `load_tts_samples()`.

        Args:
            items (list): Data samples returned by `load_tts_samples()`.

        Returns:
            Tuple[Dict, int]: speaker IDs and number of speakers.
        """
        speakers = sorted({item[2] for item in items})
        speaker_ids = {name: i for i, name in enumerate(speakers)}
        num_speakers = len(speaker_ids)
        return speaker_ids, num_speakers

    def set_speaker_ids_from_data(self, items: List) -> None:
        """Set speaker IDs from data samples.

        Args:
            items (List): Data samples returned by `load_tts_samples()`.
        """
        self.speaker_ids, _ = self.parse_speakers_from_data(items)

    def set_speaker_ids_from_file(self, file_path: str) -> None:
        """Set speaker IDs from a file.

        Args:
            file_path (str): Path to the file.
        """
        self.speaker_ids = self._load_json(file_path)

    def save_speaker_ids_to_file(self, file_path: str) -> None:
        """Save speaker IDs to a json file.

        Args:
            file_path (str): Path to the output file.
        """
        self._save_json(file_path, self.speaker_ids)

    def save_d_vectors_to_file(self, file_path: str) -> None:
        """Save d_vectors to a json file.

        Args:
            file_path (str): Path to the output file.
        """
        self._save_json(file_path, self.d_vectors)

    def set_d_vectors_from_file(self, file_path: str) -> None:
        """Load d_vectors from a json file.

        Args:
            file_path (str): Path to the target json file.
        """
        self.d_vectors = self._load_json(file_path)
        speakers = sorted({x["name"] for x in self.d_vectors.values()})
        self.speaker_ids = {name: i for i, name in enumerate(speakers)}
        self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys())))

    def get_d_vector_by_clip(self, clip_idx: str) -> List:
        """Get d_vector by clip ID.

        Args:
            clip_idx (str): Target clip ID.

        Returns:
            List: d_vector as a list.
        """
        return self.d_vectors[clip_idx]["embedding"]

    def get_d_vectors_by_speaker(self, speaker_idx: str) -> List[List]:
        """Get all d_vectors of a speaker.

        Args:
            speaker_idx (str): Target speaker ID.

        Returns:
            List[List]: all the d_vectors of the given speaker.
        """
        return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx]

    def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray:
        """Get mean d_vector of a speaker ID.

        Args:
            speaker_idx (str): Target speaker ID.
            num_samples (int, optional): Number of samples to be averaged. Defaults to None.
            randomize (bool, optional): Pick random `num_samples` of d_vectors. Defaults to False.

        Returns:
            np.ndarray: Mean d_vector.
        """
        d_vectors = self.get_d_vectors_by_speaker(speaker_idx)
        if num_samples is None:
            d_vectors = np.stack(d_vectors).mean(0)
        else:
            assert len(d_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
            if randomize:
                d_vectors = np.stack(random.choices(d_vectors, k=num_samples)).mean(0)
            else:
                d_vectors = np.stack(d_vectors[:num_samples]).mean(0)
        return d_vectors

    def get_random_speaker_id(self) -> Any:
        """Get a random speaker ID.

        Returns:
            Any: speaker ID.
        """
        if self.speaker_ids:
            return self.speaker_ids[random.choices(list(self.speaker_ids.keys()))[0]]
        return None

    def get_random_d_vector(self) -> Any:
        """Get a random d_vector.

        Returns:
            np.ndarray: d_vector.
        """
        if self.d_vectors:
            return self.d_vectors[random.choices(list(self.d_vectors.keys()))[0]]["embedding"]
        return None

    def get_speakers(self) -> List:
        return self.speaker_ids

    def get_clips(self) -> List:
        return sorted(self.d_vectors.keys())

    def init_speaker_encoder(self, model_path: str, config_path: str) -> None:
        """Initialize a speaker encoder model.

        Args:
            model_path (str): Model file path.
            config_path (str): Model config file path.
        """
        self.speaker_encoder_config = load_config(config_path)
        self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config)
        self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda)
        self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)

    def compute_d_vector_from_clip(self, wav_file: Union[str, List[str]]) -> list:
        """Compute a d_vector from a given audio file.

        Args:
            wav_file (Union[str, List[str]]): Target file path.

        Returns:
            list: Computed d_vector.
        """

        def _compute(wav_file: str):
            waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate)
            if not self.speaker_encoder_config.model_params.get("use_torch_spec", False):
                m_input = self.speaker_encoder_ap.melspectrogram(waveform)
                m_input = torch.from_numpy(m_input)
            else:
                m_input = torch.from_numpy(waveform)
            if self.use_cuda:
                m_input = m_input.cuda()
            m_input = m_input.unsqueeze(0)
            d_vector = self.speaker_encoder.compute_embedding(m_input)
            return d_vector

        if isinstance(wav_file, list):
            # compute the mean d_vector
            d_vectors = None
            for wf in wav_file:
                d_vector = _compute(wf)
                if d_vectors is None:
                    d_vectors = d_vector
                else:
                    d_vectors += d_vector
            return (d_vectors / len(wav_file))[0].tolist()
        d_vector = _compute(wav_file)
        return d_vector[0].tolist()

    def compute_d_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List:
        """Compute d_vector from features.

        Args:
            feats (Union[torch.Tensor, np.ndarray]): Input features.

        Returns:
            List: computed d_vector.
        """
        if isinstance(feats, np.ndarray):
            feats = torch.from_numpy(feats)
        if feats.ndim == 2:
            feats = feats.unsqueeze(0)
        if self.use_cuda:
            feats = feats.cuda()
        return self.speaker_encoder.compute_embedding(feats)

    def run_umap(self):
        # TODO: implement speaker encoder
        raise NotImplementedError

    def plot_embeddings(self):
        # TODO: implement speaker encoder
        raise NotImplementedError
class SpeakerManager:
    """It manages the multi-speaker setup for 🐸TTS models. It loads the speaker files and parses
    the information in a way that you can query. There are 3 different scenarios considered.

    1. Models using speaker embedding layers. The metafile only includes a mapping of speaker names to ids.
    2. Models using external embedding vectors (x vectors). The metafile includes a dictionary in the
    following format.

    ```
    {
        'clip_name.wav':{
            'name': 'speakerA',
            'embedding': [<x_vector_values>]
        },
        ...
    }
    ```

    3. Computing x vectors at inference with the speaker encoder. It loads the speaker encoder model and
    computes x vectors for a given instance.

    >>> # load audio processor and speaker encoder
    >>> ap = AudioProcessor(**config.audio)
    >>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
    >>> # load a sample audio and compute embedding
    >>> waveform = ap.load_wav(sample_wav_path)
    >>> mel = ap.melspectrogram(waveform)
    >>> x_vector = manager.compute_x_vector(mel.T)

    Args:
        x_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "".
        speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by
            the TTS model. Defaults to "".
        encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
        encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
    """

    def __init__(
        self,
        x_vectors_file_path: str = "",
        speaker_id_file_path: str = "",
        encoder_model_path: str = "",
        encoder_config_path: str = "",
    ):
        self.x_vectors = None
        self.speaker_ids = None
        self.clip_ids = None
        self.speaker_encoder = None
        self.speaker_encoder_ap = None

        if x_vectors_file_path:
            self.load_x_vectors_file(x_vectors_file_path)

        if speaker_id_file_path:
            self.load_ids_file(speaker_id_file_path)

        if encoder_model_path and encoder_config_path:
            self.init_speaker_encoder(encoder_model_path, encoder_config_path)

    @staticmethod
    def _load_json(json_file_path: str):
        with open(json_file_path) as f:
            return json.load(f)

    @staticmethod
    def _save_json(json_file_path: str, data: dict):
        with open(json_file_path, "w") as f:
            json.dump(data, f, indent=4)

    @property
    def num_speakers(self):
        return len(self.speaker_ids)

    @property
    def x_vector_dim(self):
        return len(self.x_vectors[list(self.x_vectors.keys())[0]]["embedding"])

    def parser_speakers_from_items(self, items: list):
        speaker_ids = sorted({item[2] for item in items})
        self.speaker_ids = speaker_ids
        num_speakers = len(speaker_ids)
        return speaker_ids, num_speakers

    def save_ids_file(self, file_path: str):
        self._save_json(file_path, self.speaker_ids)

    def load_ids_file(self, file_path: str):
        self.speaker_ids = self._load_json(file_path)

    def save_x_vectors_file(self, file_path: str):
        self._save_json(file_path, self.x_vectors)

    def load_x_vectors_file(self, file_path: str):
        self.x_vectors = self._load_json(file_path)
        self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values())))
        self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys())))

    def get_x_vector_by_clip(self, clip_idx: str):
        return self.x_vectors[clip_idx]["embedding"]

    def get_x_vectors_by_speaker(self, speaker_idx: str):
        return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx]

    def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False):
        x_vectors = self.get_x_vectors_by_speaker(speaker_idx)
        if num_samples is None:
            x_vectors = np.stack(x_vectors).mean(0)
        else:
            assert len(x_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
            if randomize:
                x_vectors = np.stack(random.choices(x_vectors, k=num_samples)).mean(0)
            else:
                x_vectors = np.stack(x_vectors[:num_samples]).mean(0)
        return x_vectors

    def get_speakers(self):
        return self.speaker_ids

    def get_clips(self):
        return sorted(self.x_vectors.keys())

    def init_speaker_encoder(self, model_path: str, config_path: str) -> None:
        self.speaker_encoder_config = load_config(config_path)
        self.speaker_encoder = setup_model(self.speaker_encoder_config)
        self.speaker_encoder.load_checkpoint(config_path, model_path, True)
        self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)
        # normalize the input audio level and trim silences
        self.speaker_encoder_ap.do_sound_norm = True
        self.speaker_encoder_ap.do_trim_silence = True

    def compute_x_vector_from_clip(self, wav_file: Union[str, list]) -> list:
        def _compute(wav_file: str):
            waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate)
            spec = self.speaker_encoder_ap.melspectrogram(waveform)
            spec = torch.from_numpy(spec.T)
            spec = spec.unsqueeze(0)
            x_vector = self.speaker_encoder.compute_embedding(spec)
            return x_vector

        if isinstance(wav_file, list):
            # compute the mean x_vector
            x_vectors = None
            for wf in wav_file:
                x_vector = _compute(wf)
                if x_vectors is None:
                    x_vectors = x_vector
                else:
                    x_vectors += x_vector
            return (x_vectors / len(wav_file))[0].tolist()
        x_vector = _compute(wav_file)
        return x_vector[0].tolist()

    def compute_x_vector(self, feats):
        if isinstance(feats, np.ndarray):
            feats = torch.from_numpy(feats)
        if feats.ndim == 2:
            feats = feats.unsqueeze(0)
        return self.speaker_encoder.compute_embedding(feats)

    def run_umap(self):
        # TODO: implement speaker encoder
        raise NotImplementedError

    def plot_embeddings(self):
        # TODO: implement speaker encoder
        raise NotImplementedError
base_values = sorted(10 * np.random.uniform(size=args.search_depth))
print(base_values)
exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
best_error = float('inf')
best_schedule = None
total_search_iter = len(base_values) ** args.num_iter
for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
    beta = exponents * base
    model.compute_noise_level(beta)
    for data in loader:
        mel, audio = data
        y_hat = model.inference(mel.cuda() if args.use_cuda else mel)

        if args.use_cuda:
            y_hat = y_hat.cpu()
        y_hat = y_hat.numpy()

        mel_hat = []
        for i in range(y_hat.shape[0]):
            m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
            mel_hat.append(torch.from_numpy(m))

        mel_hat = torch.stack(mel_hat)
        mse = torch.sum((mel - mel_hat) ** 2).mean()
        if mse.item() < best_error:
            best_error = mse.item()
            best_schedule = {'beta': beta}
            print(f" > Found a better schedule. - MSE: {mse.item()}")
            np.save(args.output_path, best_schedule)
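# NOTE: illustration of the candidate schedules searched above — each candidate
# drawn from cartesian_product is a length-num_iter tuple of base values that
# scales the log-spaced exponents elementwise; the numbers here are made up.
import numpy as np

num_iter = 3
exponents = 10 ** np.linspace(-6, -1, num=num_iter)
candidate = (1.3, 4.7, 0.2)               # one draw from cartesian_product(base_values, repeat=num_iter)
beta = exponents * np.asarray(candidate)  # per-step noise levels, roughly [1.3e-06, 1.5e-03, 2.0e-02]
print(beta)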
def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments,
                     use_noise_augment, use_cache, num_workers):
    ''' run dataloader with given parameters and check conditions '''
    ap = AudioProcessor(**C.audio)
    _, train_items = load_wav_data(test_data_path, 10)
    dataset = GANDataset(ap,
                         train_items,
                         seq_len=seq_len,
                         hop_len=hop_len,
                         pad_short=2000,
                         conv_pad=conv_pad,
                         return_segments=return_segments,
                         use_noise_augment=use_noise_augment,
                         use_cache=use_cache)
    loader = DataLoader(dataset=dataset,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=num_workers,
                        pin_memory=True,
                        drop_last=True)

    max_iter = 10
    count_iter = 0

    # return random segments or return the whole audio
    if return_segments:
        for item1, _ in loader:
            feat1, wav1 = item1
            # feat2, wav2 = item2
            expected_feat_shape = (batch_size, ap.num_mels,
                                   seq_len // hop_len + conv_pad * 2)

            # check shapes
            assert np.all(feat1.shape == expected_feat_shape), \
                f" [!] {feat1.shape} vs {expected_feat_shape}"
            assert (feat1.shape[2] - conv_pad * 2) * hop_len == wav1.shape[2]

            # check feature vs audio match
            if not use_noise_augment:
                for idx in range(batch_size):
                    audio = wav1[idx].squeeze()
                    feat = feat1[idx]
                    mel = ap.melspectrogram(audio)
                    # the first 2 frames and the last frame are skipped due to
                    # the padding applied in spec. computation
                    assert (feat - mel[:, :feat1.shape[-1]])[:, 2:-1].sum() == 0, \
                        f' [!] {(feat - mel[:, :feat1.shape[-1]])[:, 2:-1].sum()}'

            count_iter += 1
            # if count_iter == max_iter:
            #     break
    else:
        for item in loader:
            feat, wav = item
            expected_feat_shape = (batch_size, ap.num_mels,
                                   (wav.shape[-1] // hop_len) + (conv_pad * 2))
            assert np.all(feat.shape == expected_feat_shape), \
                f" [!] {feat.shape} vs {expected_feat_shape}"
            assert (feat.shape[2] - conv_pad * 2) * hop_len == wav.shape[2]
            count_iter += 1
            if count_iter == max_iter:
                break
class LJSpeechDataset(Dataset):
    def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate,
                 text_cleaner, num_mels, min_level_db, frame_shift_ms,
                 frame_length_ms, preemphasis, ref_level_db, num_freq, power):
        with open(csv_file, "r") as f:
            self.frames = [line.split('|') for line in f]
        self.root_dir = root_dir
        self.outputs_per_step = outputs_per_step
        self.sample_rate = sample_rate
        self.cleaners = text_cleaner
        self.ap = AudioProcessor(sample_rate, num_mels, min_level_db,
                                 frame_shift_ms, frame_length_ms, preemphasis,
                                 ref_level_db, num_freq, power)
        print(" > Reading LJSpeech from - {}".format(root_dir))
        print(" | > Number of instances : {}".format(len(self.frames)))
        self._sort_frames()

    def load_wav(self, filename):
        try:
            audio = librosa.core.load(filename, sr=self.sample_rate)
            return audio
        except RuntimeError:
            print(" !! Cannot read file : {}".format(filename))

    def _sort_frames(self):
        r"""Sort sequences in ascending order"""
        lengths = np.array([len(ins[1]) for ins in self.frames])
        print(" | > Max length sequence {}".format(np.max(lengths)))
        print(" | > Min length sequence {}".format(np.min(lengths)))
        print(" | > Avg length sequence {}".format(np.mean(lengths)))
        idxs = np.argsort(lengths)
        new_frames = [None] * len(lengths)
        for i, idx in enumerate(idxs):
            new_frames[i] = self.frames[idx]
        self.frames = new_frames

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        wav_name = os.path.join(self.root_dir, self.frames[idx][0]) + '.wav'
        text = self.frames[idx][1]
        text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32)
        wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
        sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
        return sample

    def get_dummy_data(self):
        r"""Get a dummy input for testing"""
        return torch.autograd.Variable(torch.ones(16, 143)).type(torch.LongTensor)

    def collate_fn(self, batch):
        r"""
        Perform preprocessing and create a final data batch:
        1. PAD sequences with the longest sequence in the batch
        2. Convert Audio signal to Spectrograms.
        3. PAD sequences that can be divided by r.
        4. Convert Numpy to Torch tensors.
        """
        # Puts each data field into a tensor with outer dimension batch size
        if isinstance(batch[0], collections.abc.Mapping):
            wav = [d['wav'] for d in batch]
            item_idxs = [d['item_idx'] for d in batch]
            text = [d['text'] for d in batch]

            text_lengths = np.array([len(x) for x in text])
            max_text_len = np.max(text_lengths)

            # PAD sequences with the largest length of the batch
            text = prepare_data(text).astype(np.int32)
            wav = prepare_data(wav)

            linear = np.array([self.ap.spectrogram(w).astype('float32') for w in wav])
            mel = np.array([self.ap.melspectrogram(w).astype('float32') for w in wav])
            assert mel.shape[2] == linear.shape[2]
            timesteps = mel.shape[2]

            # PAD with zeros so the length can be divided by outputs per step
            if (timesteps + 1) % self.outputs_per_step != 0:
                pad_len = self.outputs_per_step - ((timesteps + 1) % self.outputs_per_step)
                pad_len += 1
            else:
                pad_len = 1
            linear = pad_per_step(linear, pad_len)
            mel = pad_per_step(mel, pad_len)

            # reshape to B x T x D
            linear = linear.transpose(0, 2, 1)
            mel = mel.transpose(0, 2, 1)

            # convert things to pytorch
            text_lengths = torch.LongTensor(text_lengths)
            text = torch.LongTensor(text)
            linear = torch.FloatTensor(linear)
            mel = torch.FloatTensor(mel)
            return text, text_lengths, linear, mel, item_idxs[0]

        raise TypeError(("batch must contain tensors, numbers, dicts or lists;"
                         " found {}".format(type(batch[0]))))
class EmbeddingManager(BaseIDManager):
    """Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
    It defines common `Embedding` manager specific functions.
    """

    def __init__(
        self,
        embedding_file_path: str = "",
        id_file_path: str = "",
        encoder_model_path: str = "",
        encoder_config_path: str = "",
        use_cuda: bool = False,
    ):
        super().__init__(id_file_path=id_file_path)
        self.embeddings = {}
        self.embeddings_by_names = {}
        self.clip_ids = []
        self.encoder = None
        self.encoder_ap = None
        self.use_cuda = use_cuda

        if embedding_file_path:
            self.load_embeddings_from_file(embedding_file_path)

        if encoder_model_path and encoder_config_path:
            self.init_encoder(encoder_model_path, encoder_config_path, use_cuda)

    @property
    def embedding_dim(self):
        """Dimensionality of embeddings. If embeddings are not loaded, returns zero."""
        if self.embeddings:
            return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"])
        return 0

    def save_embeddings_to_file(self, file_path: str) -> None:
        """Save embeddings to a json file.

        Args:
            file_path (str): Path to the output file.
        """
        save_file(self.embeddings, file_path)

    def load_embeddings_from_file(self, file_path: str) -> None:
        """Load embeddings from a json file.

        Args:
            file_path (str): Path to the target json file.
        """
        self.embeddings = load_file(file_path)
        speakers = sorted({x["name"] for x in self.embeddings.values()})
        self.ids = {name: i for i, name in enumerate(speakers)}
        self.clip_ids = list(set(sorted(clip_name for clip_name in self.embeddings.keys())))
        # cache embeddings_by_names for fast inference using a bigger speakers.json
        self.embeddings_by_names = self.get_embeddings_by_names()

    def get_embedding_by_clip(self, clip_idx: str) -> List:
        """Get embedding by clip ID.

        Args:
            clip_idx (str): Target clip ID.

        Returns:
            List: embedding as a list.
        """
        return self.embeddings[clip_idx]["embedding"]

    def get_embeddings_by_name(self, idx: str) -> List[List]:
        """Get all embeddings of a speaker.

        Args:
            idx (str): Target name.

        Returns:
            List[List]: all the embeddings of the given speaker.
        """
        return self.embeddings_by_names[idx]

    def get_embeddings_by_names(self) -> Dict:
        """Get all embeddings by names.

        Returns:
            Dict: all the embeddings of each speaker.
        """
        embeddings_by_names = {}
        for x in self.embeddings.values():
            if x["name"] not in embeddings_by_names.keys():
                embeddings_by_names[x["name"]] = [x["embedding"]]
            else:
                embeddings_by_names[x["name"]].append(x["embedding"])
        return embeddings_by_names

    def get_mean_embedding(self, idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray:
        """Get mean embedding of an idx.

        Args:
            idx (str): Target name.
            num_samples (int, optional): Number of samples to be averaged. Defaults to None.
            randomize (bool, optional): Pick random `num_samples` of embeddings. Defaults to False.

        Returns:
            np.ndarray: Mean embedding.
        """
        embeddings = self.get_embeddings_by_name(idx)
        if num_samples is None:
            embeddings = np.stack(embeddings).mean(0)
        else:
            assert len(embeddings) >= num_samples, f" [!] {idx} has number of samples < {num_samples}"
            if randomize:
                embeddings = np.stack(random.choices(embeddings, k=num_samples)).mean(0)
            else:
                embeddings = np.stack(embeddings[:num_samples]).mean(0)
        return embeddings

    def get_random_embedding(self) -> Any:
        """Get a random embedding.

        Returns:
            np.ndarray: embedding.
        """
        if self.embeddings:
            return self.embeddings[random.choices(list(self.embeddings.keys()))[0]]["embedding"]
        return None

    def get_clips(self) -> List:
        return sorted(self.embeddings.keys())

    def init_encoder(self, model_path: str, config_path: str, use_cuda=False) -> None:
        """Initialize a speaker encoder model.

        Args:
            model_path (str): Model file path.
            config_path (str): Model config file path.
            use_cuda (bool, optional): Use CUDA. Defaults to False.
        """
        self.use_cuda = use_cuda
        self.encoder_config = load_config(config_path)
        self.encoder = setup_encoder_model(self.encoder_config)
        self.encoder_criterion = self.encoder.load_checkpoint(
            self.encoder_config, model_path, eval=True, use_cuda=use_cuda
        )
        self.encoder_ap = AudioProcessor(**self.encoder_config.audio)

    def compute_embedding_from_clip(self, wav_file: Union[str, List[str]]) -> list:
        """Compute an embedding from a given audio file.

        Args:
            wav_file (Union[str, List[str]]): Target file path.

        Returns:
            list: Computed embedding.
        """

        def _compute(wav_file: str):
            waveform = self.encoder_ap.load_wav(wav_file, sr=self.encoder_ap.sample_rate)
            if not self.encoder_config.model_params.get("use_torch_spec", False):
                m_input = self.encoder_ap.melspectrogram(waveform)
                m_input = torch.from_numpy(m_input)
            else:
                m_input = torch.from_numpy(waveform)
            if self.use_cuda:
                m_input = m_input.cuda()
            m_input = m_input.unsqueeze(0)
            embedding = self.encoder.compute_embedding(m_input)
            return embedding

        if isinstance(wav_file, list):
            # compute the mean embedding
            embeddings = None
            for wf in wav_file:
                embedding = _compute(wf)
                if embeddings is None:
                    embeddings = embedding
                else:
                    embeddings += embedding
            return (embeddings / len(wav_file))[0].tolist()
        embedding = _compute(wav_file)
        return embedding[0].tolist()

    def compute_embeddings(self, feats: Union[torch.Tensor, np.ndarray]) -> List:
        """Compute embedding from features.

        Args:
            feats (Union[torch.Tensor, np.ndarray]): Input features.

        Returns:
            List: computed embedding.
        """
        if isinstance(feats, np.ndarray):
            feats = torch.from_numpy(feats)
        if feats.ndim == 2:
            feats = feats.unsqueeze(0)
        if self.use_cuda:
            feats = feats.cuda()
        return self.encoder.compute_embedding(feats)
        if os.path.exists(wav_file):
            wav_files.append(wav_file)
    print(f'Count of wavs imported: {len(wav_files)}')
else:
    # Parse all wav files in data_path
    wav_path = data_path
    wav_files = glob.glob(data_path + '/**/*.wav', recursive=True)

# mirror the input directory tree under the output path, one .npy per wav
output_files = [
    wav_file.replace(wav_path, args.output_path).replace('.wav', '.npy')
    for wav_file in wav_files
]

for output_file in output_files:
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

# load the speaker encoder checkpoint and set it to eval mode
model = SpeakerEncoder(**c.model)
model.load_state_dict(torch.load(args.model_path)['model'])
model.eval()
if args.use_cuda:
    model.cuda()

# compute one embedding per wav file and save it next to its output path
for idx, wav_file in enumerate(tqdm(wav_files)):
    mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T
    mel_spec = torch.FloatTensor(mel_spec[None, :, :])
    if args.use_cuda:
        mel_spec = mel_spec.cuda()
    embedding = model.compute_embedding(mel_spec)
    np.save(output_files[idx], embedding.detach().cpu().numpy())
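# A quick read-back sketch (not from the source): each saved .npy holds a
# single embedding of shape (1, embedding_dim); flatten it before use.
emb = np.load(output_files[0]).flatten()
print(emb.shape)  # e.g. (256,) for a 256-dim speaker encoder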
# load the model checkpoint and set it to eval mode
model = setup_model(c)
model.load_state_dict(torch.load(args.model_path)["model"])
model.eval()
if args.use_cuda:
    model.cuda()

# compute speaker embeddings
speaker_mapping = {}
for idx, wav_file in enumerate(tqdm(wav_files)):
    if isinstance(wav_file, list):
        # list items carry the wav path at index 1 and the speaker name at index 2
        speaker_name = wav_file[2]
        wav_file = wav_file[1]
    else:
        speaker_name = None

    mel_spec = ap.melspectrogram(ap.load_wav(wav_file, sr=ap.sample_rate)).T
    mel_spec = torch.FloatTensor(mel_spec[None, :, :])
    if args.use_cuda:
        mel_spec = mel_spec.cuda()
    embedding = model.compute_embedding(mel_spec)
    embedding = embedding.detach().cpu().numpy()

    # map each wav file name to its speaker name and embedding
    wav_file_name = os.path.basename(wav_file)
    speaker_mapping[wav_file_name] = {}
    speaker_mapping[wav_file_name]["name"] = speaker_name
    speaker_mapping[wav_file_name]["embedding"] = embedding.flatten().tolist()

if speaker_mapping:
    # save speaker_mapping if target dataset is defined
    if ".json" not in args.output_path:
class LJSpeechDataset(Dataset):

    def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate,
                 text_cleaner, num_mels, min_level_db, frame_shift_ms,
                 frame_length_ms, preemphasis, ref_level_db, num_freq, power):
        self.frames = pd.read_csv(csv_file, sep='|', header=None)
        self.root_dir = root_dir
        self.outputs_per_step = outputs_per_step
        self.sample_rate = sample_rate
        self.cleaners = text_cleaner
        self.ap = AudioProcessor(sample_rate, num_mels, min_level_db,
                                 frame_shift_ms, frame_length_ms, preemphasis,
                                 ref_level_db, num_freq, power)
        print(" > Reading LJSpeech from - {}".format(root_dir))
        print(" | > Number of instances : {}".format(len(self.frames)))

    def load_wav(self, filename):
        try:
            audio = librosa.load(filename, sr=self.sample_rate)
            return audio
        except RuntimeError as e:
            print(" !! Cannot read file : {} - {}".format(filename, e))

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        # `.ix` was removed from pandas; use positional indexing instead
        wav_name = os.path.join(self.root_dir, self.frames.iloc[idx, 0]) + '.wav'
        text = self.frames.iloc[idx, 1]
        text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32)
        wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
        sample = {'text': text, 'wav': wav, 'item_idx': self.frames.iloc[idx, 0]}
        return sample

    def get_dummy_data(self):
        r"""Get a dummy input for testing."""
        return torch.ones(16, 143, dtype=torch.long)

    def collate_fn(self, batch):
        r"""Pad the batch, compute spectrograms, and convert to tensors."""
        # Put each data field into a tensor with outer dimension batch size.
        if isinstance(batch[0], collections.abc.Mapping):
            wav = [d['wav'] for d in batch]
            item_idxs = [d['item_idx'] for d in batch]
            text = [d['text'] for d in batch]

            text_lengths = np.array([len(x) for x in text])

            # PAD sequences to the longest length in the batch
            text = prepare_data(text).astype(np.int32)
            wav = prepare_data(wav)

            linear = np.array(
                [self.ap.spectrogram(w).astype('float32') for w in wav])
            mel = np.array(
                [self.ap.melspectrogram(w).astype('float32') for w in wav])
            assert mel.shape[2] == linear.shape[2]
            timesteps = mel.shape[2]

            # PAD with zeros so the frame count is divisible by
            # outputs_per_step, always adding at least one frame
            if (timesteps + 1) % self.outputs_per_step != 0:
                pad_len = self.outputs_per_step - \
                    ((timesteps + 1) % self.outputs_per_step)
                pad_len += 1
            else:
                pad_len = 1
            linear = pad_per_step(linear, pad_len)
            mel = pad_per_step(mel, pad_len)

            # reshape to (batch, time, freq)
            linear = linear.transpose(0, 2, 1)
            mel = mel.transpose(0, 2, 1)

            # convert numpy arrays to pytorch tensors
            text_lengths = torch.LongTensor(text_lengths)
            text = torch.LongTensor(text)
            linear = torch.FloatTensor(linear)
            mel = torch.FloatTensor(mel)
            return text, text_lengths, linear, mel, item_idxs[0]

        raise TypeError("batch must contain tensors, numbers, dicts or lists;"
                        " found {}".format(type(batch[0])))
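# A standalone check (not from the source) of the pad_len arithmetic used in
# collate_fn: after padding, the frame count is divisible by outputs_per_step
# (r), and at least one frame of padding is always added.
def compute_pad_len(timesteps: int, r: int) -> int:
    # mirrors the branch in collate_fn above
    if (timesteps + 1) % r != 0:
        return r - ((timesteps + 1) % r) + 1
    return 1

for r in (2, 3, 5, 7):
    for timesteps in range(1, 200):
        pad_len = compute_pad_len(timesteps, r)
        assert pad_len >= 1
        assert (timesteps + pad_len) % r == 0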