def collate_fn(self, batch): r""" Perform preprocessing and create a final data batch: 1. Sort batch instances by text-length 2. Convert Audio signal to Spectrograms. 3. PAD sequences wrt r. 4. Load to Torch. """ # Puts each data field into a tensor with outer dimension batch size if isinstance(batch[0], collections.Mapping): text_lenghts = np.array([len(d["text"]) for d in batch]) # sort items with text input length for RNN efficiency text_lenghts, ids_sorted_decreasing = torch.sort( torch.LongTensor(text_lenghts), dim=0, descending=True) wav = [batch[idx]["wav"] for idx in ids_sorted_decreasing] item_idxs = [ batch[idx]["item_idx"] for idx in ids_sorted_decreasing ] text = [batch[idx]["text"] for idx in ids_sorted_decreasing] speaker_name = [ batch[idx]["speaker_name"] for idx in ids_sorted_decreasing ] # get speaker embeddings if self.speaker_mapping is not None: wav_files_names = [ batch[idx]["wav_file_name"] for idx in ids_sorted_decreasing ] speaker_embedding = [ self.speaker_mapping[w]["embedding"] for w in wav_files_names ] else: speaker_embedding = None # compute features mel = [self.ap.melspectrogram(w).astype("float32") for w in wav] mel_lengths = [m.shape[1] for m in mel] # compute 'stop token' targets stop_targets = [ np.array([0.0] * (mel_len - 1) + [1.0]) for mel_len in mel_lengths ] # PAD stop targets stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step) # PAD sequences with longest instance in the batch text = prepare_data(text).astype(np.int32) # PAD features with longest instance mel = prepare_tensor(mel, self.outputs_per_step) # B x D x T --> B x T x D mel = mel.transpose(0, 2, 1) # convert things to pytorch text_lenghts = torch.LongTensor(text_lenghts) text = torch.LongTensor(text) mel = torch.FloatTensor(mel).contiguous() mel_lengths = torch.LongTensor(mel_lengths) stop_targets = torch.FloatTensor(stop_targets) if speaker_embedding is not None: speaker_embedding = torch.FloatTensor(speaker_embedding) # compute linear spectrogram if self.compute_linear_spec: linear = [ self.ap.spectrogram(w).astype("float32") for w in wav ] linear = prepare_tensor(linear, self.outputs_per_step) linear = linear.transpose(0, 2, 1) assert mel.shape[1] == linear.shape[1] linear = torch.FloatTensor(linear).contiguous() else: linear = None # collate attention alignments if batch[0]["attn"] is not None: attns = [batch[idx]["attn"].T for idx in ids_sorted_decreasing] for idx, attn in enumerate(attns): pad2 = mel.shape[1] - attn.shape[1] pad1 = text.shape[1] - attn.shape[0] attn = np.pad(attn, [[0, pad1], [0, pad2]]) attns[idx] = attn attns = prepare_tensor(attns, self.outputs_per_step) attns = torch.FloatTensor(attns).unsqueeze(1) else: attns = None return ( text, text_lenghts, speaker_name, linear, mel, mel_lengths, stop_targets, item_idxs, speaker_embedding, attns, ) raise TypeError(("batch must contain tensors, numbers, dicts or lists;\ found {}".format(type(batch[0]))))
def collate_fn(self, batch): r""" Perform preprocessing and create a final data batch: 1. Sort batch instances by text-length 2. Convert Audio signal to Spectrograms. 3. PAD sequences wrt r. 4. Load to Torch. """ # Puts each data field into a tensor with outer dimension batch size if isinstance(batch[0], collections.Mapping): text_lenghts = np.array([len(d["text"]) for d in batch]) # sort items with text input length for RNN efficiency text_lenghts, ids_sorted_decreasing = torch.sort( torch.LongTensor(text_lenghts), dim=0, descending=True) wav = [batch[idx]['wav'] for idx in ids_sorted_decreasing] item_idxs = [ batch[idx]['item_idx'] for idx in ids_sorted_decreasing ] text = [batch[idx]['text'] for idx in ids_sorted_decreasing] speaker_name = [ batch[idx]['speaker_name'] for idx in ids_sorted_decreasing ] # compute features mel = [self.ap.melspectrogram(w).astype('float32') for w in wav] mel_lengths = [m.shape[1] for m in mel] # compute 'stop token' targets stop_targets = [ np.array([0.] * (mel_len - 1) + [1.]) for mel_len in mel_lengths ] # PAD stop targets stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step) # PAD sequences with longest instance in the batch text = prepare_data(text).astype(np.int32) # PAD features with longest instance mel = prepare_tensor(mel, self.outputs_per_step) # B x D x T --> B x T x D mel = mel.transpose(0, 2, 1) # convert things to pytorch text_lenghts = torch.LongTensor(text_lenghts) text = torch.LongTensor(text) mel = torch.FloatTensor(mel).contiguous() mel_lengths = torch.LongTensor(mel_lengths) stop_targets = torch.FloatTensor(stop_targets) # compute linear spectrogram if self.compute_linear_spec: linear = [ self.ap.spectrogram(w).astype('float32') for w in wav ] linear = prepare_tensor(linear, self.outputs_per_step) linear = linear.transpose(0, 2, 1) assert mel.shape[1] == linear.shape[1] linear = torch.FloatTensor(linear).contiguous() else: linear = None return text, text_lenghts, speaker_name, linear, mel, mel_lengths, \ stop_targets, item_idxs raise TypeError(("batch must contain tensors, numbers, dicts or lists;\ found {}".format(type(batch[0]))))
def collate_fn(self, batch): r""" Perform preprocessing and create a final data batch: 1. Sort batch instances by text-length 2. Convert Audio signal to features. 3. PAD sequences wrt r. 4. Load to Torch. """ # Puts each data field into a tensor with outer dimension batch size if isinstance(batch[0], collections.abc.Mapping): token_ids_lengths = np.array([len(d["token_ids"]) for d in batch]) # sort items with text input length for RNN efficiency batch, token_ids_lengths, ids_sorted_decreasing = self._sort_batch( batch, token_ids_lengths) # convert list of dicts to dict of lists batch = {k: [dic[k] for dic in batch] for k in batch[0]} # get language ids from language names if self.language_id_mapping is not None: language_ids = [ self.language_id_mapping[ln] for ln in batch["language_name"] ] else: language_ids = None # get pre-computed d-vectors if self.d_vector_mapping is not None: wav_files_names = list(batch["wav_file_name"]) d_vectors = [ self.d_vector_mapping[w]["embedding"] for w in wav_files_names ] else: d_vectors = None # get numerical speaker ids from speaker names if self.speaker_id_mapping: speaker_ids = [ self.speaker_id_mapping[sn] for sn in batch["speaker_name"] ] else: speaker_ids = None # compute features mel = [ self.ap.melspectrogram(w).astype("float32") for w in batch["wav"] ] mel_lengths = [m.shape[1] for m in mel] # lengths adjusted by the reduction factor mel_lengths_adjusted = [ m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step)) if m.shape[1] % self.outputs_per_step else m.shape[1] for m in mel ] # compute 'stop token' targets stop_targets = [ np.array([0.0] * (mel_len - 1) + [1.0]) for mel_len in mel_lengths ] # PAD stop targets stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step) # PAD sequences with longest instance in the batch token_ids = prepare_data(batch["token_ids"]).astype(np.int32) # PAD features with longest instance mel = prepare_tensor(mel, self.outputs_per_step) # B x D x T --> B x T x D mel = mel.transpose(0, 2, 1) # convert things to pytorch token_ids_lengths = torch.LongTensor(token_ids_lengths) token_ids = torch.LongTensor(token_ids) mel = torch.FloatTensor(mel).contiguous() mel_lengths = torch.LongTensor(mel_lengths) stop_targets = torch.FloatTensor(stop_targets) # speaker vectors if d_vectors is not None: d_vectors = torch.FloatTensor(d_vectors) if speaker_ids is not None: speaker_ids = torch.LongTensor(speaker_ids) if language_ids is not None: language_ids = torch.LongTensor(language_ids) # compute linear spectrogram linear = None if self.compute_linear_spec: linear = [ self.ap.spectrogram(w).astype("float32") for w in batch["wav"] ] linear = prepare_tensor(linear, self.outputs_per_step) linear = linear.transpose(0, 2, 1) assert mel.shape[1] == linear.shape[1] linear = torch.FloatTensor(linear).contiguous() # format waveforms wav_padded = None if self.return_wav: wav_lengths = [w.shape[0] for w in batch["wav"]] max_wav_len = max(mel_lengths_adjusted) * self.ap.hop_length wav_lengths = torch.LongTensor(wav_lengths) wav_padded = torch.zeros(len(batch["wav"]), 1, max_wav_len) for i, w in enumerate(batch["wav"]): mel_length = mel_lengths_adjusted[i] w = np.pad(w, (0, self.ap.hop_length * self.outputs_per_step), mode="edge") w = w[:mel_length * self.ap.hop_length] wav_padded[i, :, :w.shape[0]] = torch.from_numpy(w) wav_padded.transpose_(1, 2) # format F0 if self.compute_f0: pitch = prepare_data(batch["pitch"]) assert mel.shape[1] == pitch.shape[ 1], f"[!] 
{mel.shape} vs {pitch.shape}" pitch = torch.FloatTensor( pitch)[:, None, :].contiguous() # B x 1 xT else: pitch = None # format attention masks attns = None if batch["attn"][0] is not None: attns = [batch["attn"][idx].T for idx in ids_sorted_decreasing] for idx, attn in enumerate(attns): pad2 = mel.shape[1] - attn.shape[1] pad1 = token_ids.shape[1] - attn.shape[0] assert pad1 >= 0 and pad2 >= 0, f"[!] Negative padding - {pad1} and {pad2}" attn = np.pad(attn, [[0, pad1], [0, pad2]]) attns[idx] = attn attns = prepare_tensor(attns, self.outputs_per_step) attns = torch.FloatTensor(attns).unsqueeze(1) return { "token_id": token_ids, "token_id_lengths": token_ids_lengths, "speaker_names": batch["speaker_name"], "linear": linear, "mel": mel, "mel_lengths": mel_lengths, "stop_targets": stop_targets, "item_idxs": batch["item_idx"], "d_vectors": d_vectors, "speaker_ids": speaker_ids, "attns": attns, "waveform": wav_padded, "raw_text": batch["raw_text"], "pitch": pitch, "language_ids": language_ids, } raise TypeError(("batch must contain tensors, numbers, dicts or lists;\ found {}".format(type(batch[0]))))