def __call__(self, examples):
    """Collate examples into padded mel/wav batches plus zero start offsets.

    Args:
        examples: Sized collection of indexable items. ``item[0]`` is
            treated as the mel spectrogram and ``item[1]`` as the waveform.

    Returns:
        A tuple ``(mels, wavs, audio_starts)``: the result of ``batch_spec``
        on the spectrograms, the result of ``batch_wav`` on the waveforms
        (both called with ``pad_value=self.padding_value``), and an int64
        zero vector holding one entry per example.
    """
    num_examples = len(examples)

    def _column(position):
        # Gather one field from every example, in input order.
        return [item[position] for item in examples]

    spectrograms = _column(0)
    waveforms = _column(1)

    mel_batch = batch_spec(spectrograms, pad_value=self.padding_value)
    wav_batch = batch_wav(waveforms, pad_value=self.padding_value)

    # Every start offset is zero here; no cropping happens in this collator.
    zero_starts = np.zeros((num_examples, ), dtype=np.int64)
    return mel_batch, wav_batch, zero_starts
def __call__(self, examples):
    """Collate (ids, mel, stop_probs) examples into padded batches.

    Args:
        examples: Collection of indexable items. ``item[0]`` is treated as
            the id sequence, ``item[1]`` as the mel spectrogram and
            ``item[2]`` as the stop-probability sequence.

    Returns:
        A tuple ``(ids, mels, stop_probs)``. ``ids`` and ``stop_probs`` come
        from ``batch_text_id`` with ``pad_id=self.padding_idx``; ``mels`` is
        the ``batch_spec`` output (padded with ``self.padding_value``) with
        its last two axes swapped.
    """

    def _column(position):
        # Gather one field from every example, in input order.
        return [item[position] for item in examples]

    id_seqs = _column(0)
    spectrograms = _column(1)
    stop_seqs = _column(2)

    padded_ids = batch_text_id(id_seqs, pad_id=self.padding_idx)
    padded_mels = batch_spec(spectrograms, pad_value=self.padding_value)
    padded_stops = batch_text_id(stop_seqs, pad_id=self.padding_idx)

    # Swap axes 1 and 2 of the batched spectrogram before handing it back.
    swapped_mels = np.transpose(padded_mels, [0, 2, 1])
    return padded_ids, swapped_mels, padded_stops
def __call__(self, examples):
    """Clip every example with ``self.clip`` and batch the results.

    Args:
        examples: Iterable of items accepted by ``self.clip``, which must
            return exactly three values ``(mel, wav_clip, start)``.

    Returns:
        A tuple ``(mels, wavs, starts)``: the ``batch_spec`` output for the
        clipped mels, the clipped waveforms stacked with ``np.stack``, and
        the start values as an int64 array.
    """
    # map() is lazy, so each example is clipped and unpacked in turn; the
    # three-name unpacking enforces that clip() yields exactly three items.
    clipped = [(mel, wav_clip, start)
               for mel, wav_clip, start in map(self.clip, examples)]

    mel_batch = batch_spec([triple[0] for triple in clipped])
    wav_batch = np.stack([triple[1] for triple in clipped])
    start_batch = np.array([triple[2] for triple in clipped], dtype=np.int64)
    return mel_batch, wav_batch, start_batch
def __call__(self, examples):
    """Collate (text, mel) pairs into padded batches sorted by text length.

    For each pair, the text is converted to an int64 array and a stop-token
    target is built that is zero everywhere except for a final 1.0; its
    length is ``mel.shape[1]``. All per-example lists are then reordered by
    text length, longest first, and padded.

    Args:
        examples: Iterable of ``(text, mel)`` pairs. ``mel`` must expose
            ``shape[1]``, which is recorded as that example's mel length.

    Returns:
        A tuple ``(texts, mels, text_lens, mel_lens, stop_tokens)``.
        ``texts`` is padded by ``batch_text_id`` with ``self.padding_idx``;
        ``mels`` is the ``batch_spec`` output (padded with
        ``self.padding_value``) with its last two axes swapped;
        ``text_lens`` and ``mel_lens`` are plain Python lists; and
        ``stop_tokens`` is padded by ``batch_text_id`` with
        ``self.padding_stop_token`` using the dtype of the batched mels.
    """
    texts, mels, text_lens, mel_lens, stop_tokens = [], [], [], [], []
    for text, mel in examples:
        text = np.array(text, dtype=np.int64)
        text_lens.append(len(text))
        texts.append(text)
        mels.append(mel)

        num_frames = mel.shape[1]
        mel_lens.append(num_frames)
        # Zeros for every frame but the last, which is marked with 1.0.
        leading_zeros = np.zeros([num_frames - 1], dtype=np.float32)
        stop_tokens.append(np.append(leading_zeros, 1.0))

    # One stable permutation, longest text first; examples with equal text
    # length keep their input order. The same permutation is applied to
    # every list so the fields stay aligned.
    order = sorted(
        range(len(text_lens)), key=text_lens.__getitem__, reverse=True)

    def _reorder(values):
        return [values[position] for position in order]

    texts = _reorder(texts)
    mels = _reorder(mels)
    mel_lens = _reorder(mel_lens)
    stop_tokens = _reorder(stop_tokens)
    text_lens = _reorder(text_lens)

    # Pad each sequence to the largest length in the batch.
    texts = batch_text_id(texts, pad_id=self.padding_idx)
    padded_mels = batch_spec(mels, pad_value=self.padding_value)
    mels = np.transpose(padded_mels, axes=(0, 2, 1))
    stop_tokens = batch_text_id(
        stop_tokens, pad_id=self.padding_stop_token, dtype=mels[0].dtype)
    return (texts, mels, text_lens, mel_lens, stop_tokens)
def __call__(self, samples):
    """Transform and batch (audio, mel) samples.

    When ``self.valid`` is truthy, each ``(audio, mel_spectrogram)`` pair is
    kept as is, given a start offset of 0, and the audios are batched with
    ``batch_wav``. Otherwise each sample goes through ``self.random_crop``
    and the resulting audios are packed directly into one float32 array.

    Args:
        samples: Iterable of ``(audio, mel_spectrogram)`` pairs in the
            validation case, or of items accepted by ``self.random_crop``
            otherwise. Each transformed sample is indexed as
            ``(audio, mel, start)``.

    Returns:
        A tuple ``(audios, mels, audio_starts)``; ``mels`` is the
        ``batch_spec`` output and ``audio_starts`` is an int64 array.
    """
    validating = self.valid

    # Step 1: bring every sample to (audio, mel, start) form.
    if not validating:
        prepared = [self.random_crop(sample) for sample in samples]
    else:
        prepared = [(audio, mel_spectrogram, 0)
                    for audio, mel_spectrogram in samples]

    # Step 2: split into per-field lists and batch each field.
    audio_items = [entry[0] for entry in prepared]
    start_items = [entry[2] for entry in prepared]
    mel_items = [entry[1] for entry in prepared]

    mel_batch = batch_spec(mel_items)
    if validating:
        audio_batch = batch_wav(audio_items, dtype=np.float32)
    else:
        audio_batch = np.array(audio_items, dtype=np.float32)
    start_batch = np.array(start_items, dtype=np.int64)
    return audio_batch, mel_batch, start_batch
def __call__(self, examples):
    """Collate (mel, wav) examples into padded batches.

    Args:
        examples: Collection of indexable items. ``item[0]`` is treated as
            the mel spectrogram and ``item[1]`` as the waveform.

    Returns:
        A tuple ``(mels, wavs)``: the ``batch_spec`` and ``batch_wav``
        outputs, both padded with ``self.padding_value``.
    """
    pad = self.padding_value

    def _column(position):
        # Gather one field from every example, in input order.
        return [item[position] for item in examples]

    spectrograms = _column(0)
    waveforms = _column(1)

    mel_batch = batch_spec(spectrograms, pad_value=pad)
    wav_batch = batch_wav(waveforms, pad_value=pad)
    return mel_batch, wav_batch