Example #1
# hp (hyperparameters) and audio (label/float conversion helpers) are project-level
# modules this snippet assumes are importable; numpy and torch are used directly.
import numpy as np
import torch


def collate_vocoder(batch):
    # Each batch item is (mel [n_mels, T], quantised_wav); pick a random, aligned
    # window of mel frames and audio samples from every item.
    mel_win = hp.voc_seq_len // hp.hop_length + 2 * hp.voc_pad
    max_offsets = [x[0].shape[-1] - 2 - (mel_win + 2 * hp.voc_pad) for x in batch]
    mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
    sig_offsets = [(offset + hp.voc_pad) * hp.hop_length for offset in mel_offsets]

    mels = [x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win] for i, x in enumerate(batch)]

    labels = [x[1][sig_offsets[i]:sig_offsets[i] + hp.voc_seq_len + 1] for i, x in enumerate(batch)]

    mels = np.stack(mels).astype(np.float32)
    labels = np.stack(labels).astype(np.int64)

    mels = torch.tensor(mels)
    labels = torch.tensor(labels).long()

    # Network input x and one-step-ahead target y for autoregressive training.
    x = labels[:, :hp.voc_seq_len]
    y = labels[:, 1:]

    # MOL (mixture-of-logistics) mode predicts 16-bit samples; otherwise use hp.bits.
    bits = 16 if hp.voc_mode == 'MOL' else hp.bits

    x = audio.label_2_float(x.float(), bits)

    if hp.voc_mode == 'MOL':
        y = audio.label_2_float(y.float(), bits)

    return x, y, mels
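
This collate function, and the __getitem__ methods in the examples below, convert integer sample labels back to floats with audio.label_2_float. The following is a minimal sketch of how that helper is commonly implemented in WaveRNN-style code; the project's own version may differ slightly.

import numpy as np

def label_2_float(x, bits):
    # Map integer labels in [0, 2**bits - 1] to floats in [-1, 1].
    return 2 * x / (2 ** bits - 1.0) - 1.0

def float_2_label(x, bits):
    # Inverse mapping, clipped to the valid label range.
    x = (x + 1.0) * (2 ** bits - 1) / 2
    return np.clip(x, 0, 2 ** bits - 1)

print(label_2_float(np.array([0, 255, 511]), 9))  # [-1.0, ~-0.002, 1.0]
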
Example #2
def __getitem__(self, index):
    # Dataset method: self.raw_data holds acoustic features with shape
    # [n_features, T] and self.wav the quantised waveform of the same utterance.
    i = random.randrange(1, self.raw_data.shape[1] - self.sample_frames)
    sig = self.wav[int(hparams.hop_size * (i - 0.5)) - 1:int(hparams.hop_size * (i - 0.5 + self.sample_frames))]
    assert len(sig) == self.sample_frames * hparams.hop_size + 1
    prev = audio.label_2_float(sig[:-1], hparams.bits)
    return torch.Tensor(self.speaker), torch.Tensor(self.raw_data[:, i:i + self.sample_frames].T), \
           torch.Tensor(prev), torch.Tensor(sig[1:]).long()
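
The assert above holds because the waveform slice is deliberately one sample longer than sample_frames * hop_size, so sig[:-1] and sig[1:] form an input/target pair shifted by one step. A quick check with illustrative values (hop_size, sample_frames and i below are assumptions, not the project's settings):

hop_size, sample_frames, i = 200, 40, 10                 # illustrative values only
start = int(hop_size * (i - 0.5)) - 1                    # 1899
stop = int(hop_size * (i - 0.5 + sample_frames))         # 9900
assert stop - start == sample_frames * hop_size + 1      # 8001 samples: one extra for the shift
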
Example #3
def __getitem__(self, index):
    # Audio-visual variant: the audio window is converted to a number of video
    # frames by going through seconds (hop_size / sample_rate) times the video
    # frame rate used here (20).
    video_length = int(hparams.hop_size / hparams.sample_rate * self.sample_frames * 20)
    # video = np.zeros((video_length, 3, 256, 256))
    video = torch.Tensor(video_length, 3, 128, 128)
    if self.use_256:
        video_large = torch.Tensor(video_length, 3, 256, 256)
    else:
        video_large = torch.Tensor(video_length, 3, 512, 512)
    if not self.ret_wav:
        # Offsets are aligned to multiples of 32 acoustic frames.
        i = random.randrange(0, (self.raw_data.shape[1] - self.sample_frames + 1) // 32) * 32
        video_index = int(i / 4)
        for j in range(video_length):
            video[j, :, :, :] = self.transform(Image.open(self.list_frame[j + video_index]).convert('RGB'))
            video_large[j, :, :, :] = self.transform_large(Image.open(self.list_frame[j + video_index]).convert('RGB'))
        return torch.Tensor(self.speaker), torch.Tensor(self.raw_data[:, i:i + self.sample_frames].T), video, video_large
    else:
        i = random.randrange(1, (self.raw_data.shape[1] - self.sample_frames) // 32) * 32
        sig = self.wav[int(hparams.hop_size * (i - 0.5)) - 1:int(hparams.hop_size * (i - 0.5 + self.sample_frames))]
        assert len(sig) == self.sample_frames * hparams.hop_size + 1
        prev = audio.label_2_float(sig[:-1], hparams.bits)
        video_index = int(i / 4)
        for j in range(video_length):
            video[j, :, :, :] = self.transform(Image.open(self.list_frame[j + video_index]).convert('RGB'))
            video_large[j, :, :, :] = self.transform_large(Image.open(self.list_frame[j + video_index]).convert('RGB'))
        assert not (np.isnan(prev).any() or np.isnan(sig).any())
        return torch.Tensor(self.speaker), torch.Tensor(self.raw_data[:, i:i + self.sample_frames].T), \
               torch.Tensor(prev), torch.Tensor(sig[1:]).long(), video, video_large
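
The int(i / 4) step assumes four acoustic frames per video frame. With illustrative values (hop_size=200, sample_rate=16000, i.e. 80 acoustic frames per second, and 20 fps video; these are assumptions, not values read from the project's hparams) the ratio works out as follows:

hop_size, sample_rate, video_fps = 200, 16000, 20                        # assumed values
sample_frames = 32

frames_per_video_frame = (sample_rate / hop_size) / video_fps            # 4.0
video_length = int(hop_size / sample_rate * sample_frames * video_fps)   # 8 video frames
i = 64                                                                   # an offset aligned to 32
video_index = i // 4                                                     # 16, matching int(i / 4)
print(frames_per_video_frame, video_length, video_index)
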
Example #4
# Variant of Example #1 that tolerates clips shorter than one training window.
# hp and audio are the same project-level modules assumed in Example #1.
import numpy as np
import torch


def collate_vocoder(batch):
    mel_win = hp.voc_seq_len // hp.hop_length + 2 * hp.voc_pad

    #max_offsets = [0, x[0].shape[-1] -2 - (mel_win + 2 * hp.voc_pad) for x in batch]
    # Clamp to at least 1 so np.random.randint never gets an empty range on short clips.
    max_offsets = [max(1, x[0].shape[-1] - 2 - (mel_win + 2 * hp.voc_pad)) for x in batch]
    mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
    sig_offsets = [(offset + hp.voc_pad) * hp.hop_length for offset in mel_offsets]

    #mels = [x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win] for i, x in enumerate(batch)]
    #labels = [x[1][sig_offsets[i]:sig_offsets[i] + hp.voc_seq_len + 1] for i, x in enumerate(batch)]

    mels = []
    labels = []
    for i, x in enumerate(batch):
        sliced_mel = x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win]
        if len(sliced_mel[0]) < mel_win:
            # Short clip: pad the time axis with the "silence" value.
            sliced_mel = np.pad(sliced_mel, [(0, 0), (0, mel_win - len(sliced_mel[0]))],
                                mode='constant', constant_values=-hp.mel_max_abs_value)
            print("padded mel with %f" % -hp.mel_max_abs_value)
            assert len(sliced_mel[0]) == mel_win
        mels.append(sliced_mel)

        # One extra label so the slice covers both the input and the one-step-ahead target.
        sliced_sig = x[1][sig_offsets[i]:sig_offsets[i] + hp.voc_seq_len + 1]
        if len(sliced_sig) < hp.voc_seq_len + 1:
            sliced_sig = np.pad(sliced_sig, (0, hp.voc_seq_len + 1 - len(sliced_sig)),
                                mode='constant', constant_values=0)
            print("padded seq with 0")
            assert len(sliced_sig) == hp.voc_seq_len + 1
        labels.append(sliced_sig)

    mels = np.stack(mels).astype(np.float32)
    labels = np.stack(labels).astype(np.int64)

    mels = torch.tensor(mels)
    labels = torch.tensor(labels).long()

    x = labels[:, :hp.voc_seq_len]
    y = labels[:, 1:]

    bits = 16 if hp.voc_mode == 'MOL' else hp.bits

    x = audio.label_2_float(x.float(), bits)

    if hp.voc_mode == 'MOL':
        y = audio.label_2_float(y.float(), bits)

    # cur [B, L], future [B, L] bit label, mels [B, D, T]
    return x, y, mels
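
The padding branch above relies on np.pad filling the missing mel frames with a constant. A small self-contained check (80 mel bins, mel_win=10 and the pad value -4.0 are illustrative stand-ins for the project's hp settings):

import numpy as np

mel_win = 10
sliced_mel = np.random.randn(80, 7).astype(np.float32)           # [n_mels, T], 3 frames short
padded = np.pad(sliced_mel, [(0, 0), (0, mel_win - sliced_mel.shape[1])],
                mode='constant', constant_values=-4.0)            # stand-in for -hp.mel_max_abs_value
print(padded.shape)                                               # (80, 10)
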