Example #1
def data_generator(data_base,chunk_length_in_sec,label_order_list,batch_size):    
    stft = TacotronSTFT(**SFT_CONFIG)
    keys = data_base.get_db_keys()    
    batch_x = []
    batch_y = []
    while True:
            for k in keys:
                label = np.array([label_order_list.index(k)])
                times = data_base.get_db_wv_times(k)
                sampling_rate, speech = data_base.get_wav(k, *times[0])
                chunks = int(len(speech)/sampling_rate/chunk_length_in_sec)
                audio_length = sampling_rate*chunk_length_in_sec
                for chunk in range(chunks):                
                    audio = speech[chunk*audio_length:(chunk+1)*audio_length]
                    audio_norm = audio / MAX_WAV_VALUE    
                    audio_norm = torch.from_numpy(audio_norm).float()
                    audio_norm = audio_norm.unsqueeze(0)
                    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
                    melspec = stft.mel_spectrogram(audio_norm)
                    mel_np = melspec.detach().numpy()
                    for i in range(mel_np.shape[1]):
                        channel_mean = np.mean(mel_np[0,i,:])  
                        mel_np[0,i,:] = mel_np[0,i,:] - channel_mean
                    
                    #normalized_mel = torch.from_numpy(mel_np)
                    batch_x.append(mel_np)
                    batch_y.append(label)
                    #yield normalized_mel.unsqueeze(1), Variable(y_tensor)
                    if len(batch_x) >= batch_size:
                        x = torch.from_numpy(np.array(batch_x))
                        y = Variable(torch.from_numpy(np.concatenate(batch_y)).long())
                        batch_x = []
                        batch_y = []
                        yield x,y
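The per-channel mean normalization in the loop above can also be written without iterating over mel channels. A minimal vectorized sketch, assuming `mel_np` has the `(1, n_mel_channels, n_frames)` shape produced by `TacotronSTFT.mel_spectrogram`:

import numpy as np

def mean_normalize_channels(mel_np):
    """Subtract each mel channel's mean over time; mel_np is (1, n_mels, n_frames)."""
    return mel_np - mel_np.mean(axis=2, keepdims=True)

# quick check that this matches the loop-based normalization above
mel_np = np.random.randn(1, 80, 100).astype(np.float32)
normalized = mean_normalize_channels(mel_np)
assert np.allclose(normalized.mean(axis=2), 0.0, atol=1e-5)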
Example #2
class Mel2SampOnehot(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, mu_quantization,
                 filter_length, hop_length, win_length, sampling_rate,
                 mel_fmin, mel_fmax):
        audio_files = utils.files_to_list(training_files)
        self.audio_files = audio_files
        random.seed(1234)
        random.shuffle(self.audio_files)
        mel_fmax = None if mel_fmax == -1 else mel_fmax

        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)

        self.segment_length = segment_length
        self.mu_quantization = mu_quantization
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / utils.MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = utils.load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data

        mel = self.get_mel(audio)
        audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE,
                                    self.mu_quantization)
        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
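Example 2 quantizes the normalized waveform with `utils.mu_law_encode`, which is not shown here. As a hedged sketch, the standard mu-law companding such a helper typically implements looks like the following (the real helper's name and signature live in this project's `utils` module and may differ):

import math
import torch

def mu_law_encode(audio, mu_quantization=256):
    """Map float audio in [-1, 1] to integer bins [0, mu_quantization - 1] (standard mu-law)."""
    mu = mu_quantization - 1
    companded = torch.sign(audio) * torch.log1p(mu * torch.abs(audio)) / math.log1p(mu)
    return ((companded + 1.0) / 2.0 * mu + 0.5).long()

# example: a 440 Hz sine wave at 22050 Hz ends up in bins 0..255
t = torch.linspace(0.0, 1.0, 22050)
bins = mu_law_encode(torch.sin(2.0 * math.pi * 440.0 * t), 256)
assert 0 <= bins.min() and bins.max() <= 255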
Example #3
class Tacotron(AudioProcessing):
    """Preprocesses audio as in the Tacotron2 code."""
    def __init__(
        self,
        sampling_rate,
        n_mel_channels,
        filter_length=1024,
        hop_length=256,
        win_length=1024,
        mel_fmin=0.0,
        mel_fmax=8000.0,
    ):
        super(Tacotron, self).__init__(sampling_rate, n_mel_channels)
        self.taco_stft = TacotronSTFT(
            filter_length=filter_length,
            hop_length=hop_length,
            win_length=win_length,
            sampling_rate=sampling_rate,
            n_mel_channels=n_mel_channels,
            mel_fmin=mel_fmin,
            mel_fmax=mel_fmax,
        )

    def audio_to_mel(self, audio):
        audio = torch.tensor(audio)
        audio = audio.unsqueeze(0)
        melspec = self.taco_stft.mel_spectrogram(audio)
        melspec = torch.squeeze(melspec, 0)
        return melspec.T

    def mel_to_audio(self, mel):
        # TODO make it work in batch mode
        mel = mel.unsqueeze(0)
        mel_decompress = self.taco_stft.spectral_de_normalize(mel)
        mel_decompress = mel_decompress.transpose(1, 2).data.cpu()

        spec_from_mel_scaling = 1000

        spec_from_mel = torch.mm(mel_decompress[0], self.taco_stft.mel_basis)
        spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
        spec_from_mel = spec_from_mel * spec_from_mel_scaling

        GRIFFIN_ITERS = 60
        audio = griffin_lim(
            spec_from_mel[:, :, :-1],
            self.taco_stft.stft_fn,
            GRIFFIN_ITERS,
        )
        audio = audio.squeeze()
        audio = audio.cpu().numpy()

        return audio
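A possible round trip through this wrapper, purely as an illustrative sketch: it assumes `AudioProcessing`, `TacotronSTFT`, and `griffin_lim` are importable from the surrounding project, and that the input is 1-D float audio in [-1, 1] at the configured sampling rate. Note that `audio_to_mel` returns the mel transposed to `(n_frames, n_mel_channels)`, so it is transposed back before calling `mel_to_audio`.

import numpy as np

# hypothetical usage; module paths and hyperparameters depend on the surrounding project
processor = Tacotron(sampling_rate=22050, n_mel_channels=80)

wav = np.random.uniform(-1.0, 1.0, 22050).astype(np.float32)  # stand-in for real audio
mel = processor.audio_to_mel(wav)       # (n_frames, n_mel_channels)
recon = processor.mel_to_audio(mel.T)   # Griffin-Lim reconstruction from the mel
print(mel.shape, recon.shape)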
Example #4
class Get_mel():
    def __init__(self, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax):
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec
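Example #5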
def data_generator(data_base, chunk_length_in_sec, label_order_list):
    stft = TacotronSTFT(**SFT_CONFIG)
    keys = data_base.get_db_keys()
    batch_train_x = []
    batch_train_y = []

    while True:
        for k in keys:
            label = np.array([label_order_list.index(k)])
            for t in data_base.get_db_wv_times(k):
                sampling_rate, speech = data_base.get_wav(k, *t)
                chunks = int(len(speech) / sampling_rate / chunk_length_in_sec)
                audio_length = sampling_rate * chunk_length_in_sec
                for chunk in range(chunks):
                    audio = speech[chunk * audio_length:(chunk + 1) *
                                   audio_length]
                    audio_norm = audio / MAX_WAV_VALUE
                    audio_norm = torch.from_numpy(audio_norm).float()
                    audio_norm = audio_norm.unsqueeze(0)
                    audio_norm = torch.autograd.Variable(audio_norm,
                                                         requires_grad=False)
                    melspec = stft.mel_spectrogram(audio_norm)
                    mel_np = melspec.detach().numpy()
                    for i in range(mel_np.shape[1]):
                        channel_mean = np.mean(mel_np[0, i, :])
                        mel_np[0, i, :] = mel_np[0, i, :] - channel_mean

                    batch_train_x.append(mel_np)
                    batch_train_y.append(label)
            if len(batch_train_x) > 0 and len(batch_train_y) > 0:
                yield np.array(batch_train_x), np.concatenate(
                    np.array(batch_train_y))
            else:
                yield np.array([]), np.array([])

            batch_train_x = []
            batch_train_y = []

        yield None, None
Example #6
class Mel2SampWaveglow(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, segment_length, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax):
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, filepath):
        audio, sr = load_wav_to_torch(filepath)
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec
Example #7
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, validation_files, validation_windows,
                 segment_length, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax, load_mel_from_disk,
                 preempthasis):
        self.audio_files = load_filepaths_and_text(training_files)

        print("Files before checking: ", len(self.audio_files))

        i = 0
        i_offset = 0
        for i_ in range(len(self.audio_files)):
            i = i_ + i_offset
            if i == len(self.audio_files): break
            file = self.audio_files[i]
            if not os.path.exists(file[0]):
                print(file[0], "does not exist")
                self.audio_files.remove(file)
                i_offset -= 1
                continue

            audio_data, sample_r = load_wav_to_torch(file[0])
            if audio_data.size(0) <= segment_length:
                print(file[0], "is too short")
                self.audio_files.remove(file)
                i_offset -= 1
                continue

        print("Files after checking: ", len(self.audio_files))

        self.load_mel_from_disk = load_mel_from_disk
        self.speaker_ids = self.create_speaker_lookup_table(self.audio_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 n_mel_channels=160,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        if preempthasis:
            self.preempthasise = PreEmphasis(preempthasis)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.hop_length = hop_length
        self.win_length = win_length

    def create_speaker_lookup_table(self, audiopaths_and_text):
        speaker_ids = np.sort(np.unique([x[2] for x in audiopaths_and_text]))
        d = {int(speaker_ids[i]): i for i in range(len(speaker_ids))}
        return d

    def get_speaker_id(self, speaker_id):
        return torch.IntTensor([self.speaker_ids[int(speaker_id)]])

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm).squeeze(0)
        return melspec

    def get_segment(self,
                    audio,
                    mel,
                    segment_length,
                    hop_length,
                    n_mel_channels=160):
        mel_segment_length = int(segment_length / hop_length)  # 8400/600 = 14
        if audio.size(0) >= segment_length:
            max_mel_start = int(
                (audio.size(0) - segment_length) /
                hop_length)  # audio.size(0)%self.hop_length is the remainder
            mel_start = random.randint(0, max_mel_start)
            audio_start = mel_start * hop_length
            audio = audio[audio_start:audio_start + segment_length]
            mel = mel[:, mel_start:mel_start + mel_segment_length]
        else:
            mel_start = 0
            n_mel_channels = 160  # TODO take from config file
            len_pad = int((segment_length / hop_length) - mel.shape[1])
            pad = np.ones(
                (n_mel_channels, len_pad), dtype=np.float32) * -11.512925
            mel = np.append(mel, pad, axis=1)
            audio = torch.nn.functional.pad(
                audio, (0, segment_length - audio.size(0)), 'constant').data
        return audio, mel, mel_start, mel_start + mel_segment_length

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename[0])
        assert audio.shape[
            0], f"Audio has 0 length.\nFile: {filename[0]}\nIndex: {index}"
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        if (self.load_mel_from_disk):
            # Take segment
            mel = np.load(filename[1])
            assert self.segment_length % self.hop_length == 0, 'self.segment_length must be a multiple of self.hop_length'
            #            if (mel.shape[1] > ceil(len(audio)/self.hop_length)):
            #                print('mel is longer than audio file')
            #                print('path', filename[1], '\nmel_length', mel.shape[1], '\naudio length', len(audio), '\naudio_hops', ceil(len(audio)/self.hop_length))
            #                raise Exception
            #            if (mel.shape[1] < ceil(len(audio)/self.hop_length)):
            #                print('mel is shorter than audio file')
            #                print('path', filename[1], '\nmel_length', mel.shape[1], '\naudio length', len(audio), '\naudio_hops', ceil(len(audio)/self.hop_length))
            #                raise Exception
            loop = 0
            while True:
                audio_, mel_, start_step, stop_step = self.get_segment(
                    audio, mel, self.segment_length,
                    self.hop_length)  # get random segment of audio file
                std = torch.std(audio_)
                if std > 250:
                    break  # if sample is not silent, use sample for WaveGlow.
                loop += 1
                if loop > 20:
                    print("No Silent Sample Found, filename:", filename[0])
                    break
            #print(f"STD: {std} Loops: {loop}")
            audio, mel = audio_, mel_

            mel = torch.from_numpy(mel).float()
        else:
            # Take segment
            if audio.size(0) >= self.segment_length:
                max_audio_start = audio.size(0) - self.segment_length
                std = 9e9
                loop = 0
                while True:
                    audio_start = random.randint(0, max_audio_start)
                    audio_segment = audio[audio_start:audio_start +
                                          self.segment_length]
                    std = torch.std(audio_segment)
                    if std > 250:
                        break  # if sample is not silent, use sample for WaveGlow.
                    loop += 1
                    if loop > 20:
                        print("No Silent Sample Found, filename:", filename[0])
                        break
                audio = audio_segment
            else:
                audio = torch.nn.functional.pad(
                    audio, (0, self.segment_length - audio.size(0)),
                    'constant').data
            assert audio.shape[
                0], f"Audio has 0 length.\nFile: {filename[0]}\nIndex: {index}"
            mel = self.get_mel(audio)  # generate mel from audio segment

        audio = audio / MAX_WAV_VALUE

        if hasattr(self, 'preempthasise'):
            audio = self.preempthasise(
                audio.unsqueeze(0).unsqueeze(0)).squeeze()

        speaker_id = self.get_speaker_id(filename[2])

        #mel = (mel+5.2)*0.5 # shift values between approx -4 and 4
        return (mel, audio, speaker_id)  # (mel, audio, speaker_id)

    def __len__(self):
        return len(self.audio_files)
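The random-segment selection with silence rejection used in Example 7 (retry while the segment's standard deviation stays under a threshold) can be isolated into a small helper. A minimal self-contained sketch of that idea; the threshold of 250 assumes un-normalized 16-bit sample values, as in the example above:

import random
import torch

def pick_loud_segment(audio, segment_length, std_threshold=250.0, max_tries=20):
    """Pick a random fixed-length segment, retrying a few times if it looks silent."""
    if audio.size(0) < segment_length:
        return torch.nn.functional.pad(audio, (0, segment_length - audio.size(0)), 'constant')
    max_start = audio.size(0) - segment_length
    segment = audio[:segment_length]
    for _ in range(max_tries):
        start = random.randint(0, max_start)
        segment = audio[start:start + segment_length]
        if torch.std(segment) > std_threshold:
            break  # non-silent segment found, keep it
    return segment

# usage: a mostly silent clip with one loud region
audio = torch.zeros(48000)
audio[30000:34000] = torch.randn(4000) * 1000.0
segment = pick_loud_segment(audio, 16000)
print(torch.std(segment))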
Example #8
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        # filter out short audio clips (disabled)
        # i = 0
        # for file in files_to_list(training_files):
        #     audio_data, sample_r = load_wav_to_torch(file)

        #     if audio_data.size(0) < segment_length:
        #         i += 1
        #         print(file)
        #         self.audio_files.remove(file)
        # print("{} files shorter than segment_len".format(i))

        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            print("Warning: short wav")
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data

        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
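Dataset classes like the `Mel2Samp` above are normally consumed through `torch.utils.data.DataLoader`, which batches the fixed-size `(mel, audio)` pairs returned by `__getitem__`. A hedged usage sketch; the file list path and the STFT hyperparameters below are placeholders, not values from this project's config:

import torch

# hypothetical hyperparameters; real values come from the training config
dataset = Mel2Samp(training_files="train_files.txt", segment_length=16000,
                   filter_length=1024, hop_length=256, win_length=1024,
                   sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0)

loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True,
                                     num_workers=2, pin_memory=True, drop_last=True)

for mel, audio in loader:
    # mel: (batch, n_mel_channels, n_frames), audio: (batch, segment_length) in [-1, 1]
    break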
Example #9
class Mel2SampSplit(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.dataset = self.pack()

    def pack(self):
        timings = np.zeros(len(self.audio_files), dtype= np.int32)
        PAD = 350
        assert(self.sampling_rate % PAD == 0)

        for i,file in enumerate(self.audio_files):
            audio, sampling_rate = load_wav_to_torch(file)
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            t = audio.size(0)
            t2 = t + (PAD - t % PAD) % PAD  # round length up to a multiple of PAD

            timings[i] = t2

        segment_len = self.sampling_rate

        total_time = timings.sum()
        # number of fixed-length rows needed (ceil division)
        n_data = int(total_time // segment_len)
        if total_time % segment_len != 0:
            n_data += 1

        ##import pdb; pdb.set_trace()

        dataset = torch.zeros([n_data, segment_len], dtype=torch.float32)  ## all data will be packed here
        offset = 0
        cur = 0
        for i,file in enumerate(self.audio_files):
            audio, _ = load_wav_to_torch(file)
            audio = torch.nn.functional.pad(audio, (0, timings[i] - audio.size(0)), 'constant').data
            assert(timings[i]  == audio.size(0))
            data_left =  audio.size(0)
            data_offset = 0
            space = segment_len - offset
            while (data_left >= space): ## fill the next data segment to the end
                dataset.data[cur,offset:offset+space] = audio[data_offset:data_offset+space]
                data_left = data_left - space
                data_offset = data_offset + space
                offset = 0
                space = segment_len
                cur = cur + 1

            ## append what's left in the next data segment
            if data_left > 0:
                new_offset = offset + data_left
                dataset.data[cur,offset:new_offset] = audio[data_offset:]
                offset = new_offset
                
        return dataset

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        audio = self.dataset.data[index,:]
        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE
        return (mel, audio)

    def __len__(self):
        return self.dataset.size(0)
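The `pack()` method above concatenates every (padded) clip into rows of exactly `sampling_rate` samples, so `__len__` is the total padded duration divided by the segment length, rounded up. A small self-contained sketch of the same packing arithmetic on dummy tensors (the per-clip padding to a multiple of `PAD` is omitted here):

import torch

def pack_rows(clips, segment_len):
    """Concatenate 1-D clips and reshape into fixed-length rows, zero-padding the tail."""
    flat = torch.cat(clips)
    n_rows = (flat.size(0) + segment_len - 1) // segment_len  # ceil division
    flat = torch.nn.functional.pad(flat, (0, n_rows * segment_len - flat.size(0)), 'constant')
    return flat.view(n_rows, segment_len)

rows = pack_rows([torch.ones(30000), torch.ones(50000)], segment_len=22050)
print(rows.shape)  # torch.Size([4, 22050]): 80000 samples need 4 rows of 22050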
Example #10
class Mel2Samp2(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)

        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.everything = self.pack()


        self.max_time = self.segment_length

        if self.max_time == 0:  ## auto configuration for maximum efficiency
            best = -1
            score = 0.0
            for x in range(250000, 1000000, 10000):
                self.max_time = x
                self.do_binpacking()

                utilized = np.asarray(self.volumes).mean() / self.max_time
                if utilized > score:
                    score = utilized
                    best = x
            self.max_time = best
        self.do_binpacking()

        ##import pdb; pdb.set_trace()
        perm = list(range(len(self.balancer)))
        random.shuffle(perm)
        self.volumes = [self.volumes[p] for p in perm  ]
        self.balancer = [self.balancer[p] for p in perm  ]

        
    def pack(self):
        timings = []
        for file in self.audio_files:
            audio, sampling_rate = load_wav_to_torch(file)
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            timings.append(audio.size(0))
        return timings

    def get_timings(self):
        timings = []
        for file in self.audio_files:
            audio, sampling_rate = load_wav_to_torch(file)
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            timings.append(audio.size(0))
        return np.array(timings)

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        print(index)

        idxs = self.balancer[index]
        time = self.volumes[index]

        pad = (self.max_time - time) // (len(idxs)- 1)
        print(pad)
        print(time)
        print(idxs)
        audios = []
        for k,idx in enumerate(idxs):
            filename = self.audio_files[idx]
            audio, sampling_rate = load_wav_to_torch(filename)
            
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))

            print("before pad %d: %s" % ( idx ,audio.shape))

            if k != len(idxs) - 1:
                audio = torch.nn.functional.pad(audio, (0, pad), 'constant').data
                print("after pad %d: %s" % ( idx ,audio.shape))
            
            audios.append(audio)

        audio = torch.cat(audios)
        print("after cat: %s" % audio.shape)

        if audio.size(0) < self.max_time:
            audio = torch.nn.functional.pad(audio, (0, self.max_time- audio.size(0)), 'constant').data
        print("after last pad: %s" % audio.shape)

        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        return (mel, audio)



    def __len__(self):
        return len(self.balancer)
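Example 10 depends on `self.do_binpacking()`, `self.balancer`, and `self.volumes`, none of which are shown. Purely as an illustration of what such a step could look like (this is a hypothetical first-fit packer, not the author's implementation), clip lengths can be grouped into bins whose total stays below `max_time`:

def first_fit_binpack(lengths, max_time):
    """Hypothetical sketch: group clip indices into bins with total length <= max_time."""
    bins, volumes = [], []
    for idx, length in enumerate(lengths):
        for b, volume in enumerate(volumes):
            if volume + length <= max_time:
                bins[b].append(idx)
                volumes[b] += length
                break
        else:  # no existing bin fits, open a new one
            bins.append([idx])
            volumes.append(length)
    return bins, volumes

balancer, volumes = first_fit_binpack([40000, 90000, 30000, 70000], max_time=120000)
print(balancer, volumes)  # [[0, 2], [1], [3]] [70000, 90000, 70000]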
Example #11
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self,
                 training_files,
                 segment_length,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 mel_fmin,
                 mel_fmax,
                 load_mel_from_disk=False):
        self.load_mel_from_disk = load_mel_from_disk
        self.hop_length = hop_length
        self.audio_files = audiopaths_and_melpaths(
            training_files) if self.load_mel_from_disk else files_to_list(
                training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_mel_from_file(self, mel_path):
        melspec = torch.from_numpy(np.load(mel_path)).float()
        melspec = torch.autograd.Variable(melspec, requires_grad=False)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename[0]) if self.load_mel_from_disk \
                        else load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        if (self.load_mel_from_disk):
            # Take segment
            mel = np.load(filename[1])
            assert self.segment_length % self.hop_length == 0, 'self.segment_length must be a multiple of self.hop_length'
            max_mel_length = int(self.segment_length / self.hop_length)
            audio_ = audio.data.cpu().numpy()
            if (mel.shape[1] > len(audio_) / self.hop_length):  #handling error
                diff = int(mel.shape[1] - len(audio_) / self.hop_length)
                mel = mel[:, :-diff]
            if (mel.shape[1] < len(audio_) / self.hop_length):
                print(filename, mel.shape, len(audio))
            if audio.size(0) >= self.segment_length:
                max_mel_start = int(
                    (audio.size(0) - self.segment_length) / self.hop_length
                )  # audio.size(0)%self.hop_length is the remainder
                mel_start = random.randint(0, max_mel_start)
                audio_start = mel_start * self.hop_length
                audio = audio[audio_start:audio_start + self.segment_length]
                mel = mel[:, mel_start:mel_start + max_mel_length]
            else:
                len_pad = int((self.segment_length / self.hop_length) -
                              mel.shape[1])
                pad = np.ones((80, len_pad), dtype=np.float32) * -11.512925
                mel = np.append(mel, pad, axis=1)
                audio = torch.nn.functional.pad(
                    audio, (0, self.segment_length - audio.size(0)),
                    'constant').data

            mel = torch.from_numpy(mel).float()
            audio = audio / MAX_WAV_VALUE
            # if(mel.shape[1] != int(self.segment_length/self.hop_length)):
            #     print()
        else:
            # Take segment
            if audio.size(0) >= self.segment_length:
                max_audio_start = audio.size(0) - self.segment_length
                audio_start = random.randint(0, max_audio_start)
                audio = audio[audio_start:audio_start + self.segment_length]
            else:
                audio = torch.nn.functional.pad(
                    audio, (0, self.segment_length - audio.size(0)),
                    'constant').data

            mel = self.get_mel(audio)  #
            audio = audio / MAX_WAV_VALUE
        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
Example #12
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self,
                 training_files,
                 segment_length,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 data_folder,
                 audio_format,
                 return_stft=False):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.return_stft = return_stft
        if self.return_stft:
            self.stft = STFT(filter_length=filter_length,
                             hop_length=hop_length,
                             win_length=win_length)
        else:
            self.stft = TacotronSTFT(filter_length=filter_length,
                                     hop_length=hop_length,
                                     win_length=win_length,
                                     sampling_rate=sampling_rate,
                                     mel_fmin=0.0,
                                     mel_fmax=8000.0)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.data_folder = data_folder
        self.audio_format = audio_format

    def get_stft(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        if self.return_stft:
            magnitudes, phases = self.stft.transform(audio_norm)
            magnitudes = dynamic_range_compression(magnitudes)
            magnitudes = torch.squeeze(magnitudes, 0)
            return magnitudes
        else:
            melspec = self.stft.mel_spectrogram(audio_norm)
            melspec = torch.squeeze(melspec, 0)
            return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        filename = os.path.join(self.data_folder, filename)
        audio, sampling_rate = load_wav_to_torch(filename, self.audio_format)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data
            print('{} - NOT ENOUGH FRAMES'.format(filename))
        stft = self.get_stft(audio)
        audio = audio / MAX_WAV_VALUE

        return (stft, audio)

    def __len__(self):
        return len(self.audio_files)
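Example 12's STFT branch compresses raw magnitudes with `dynamic_range_compression`, which is not shown above. In Tacotron2/WaveGlow-style codebases this is usually a clamped log compression; a sketch of that common definition follows (the defaults in this project may differ):

import torch

def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Log-compress magnitudes, clamping tiny values to avoid log(0)."""
    return torch.log(torch.clamp(x, min=clip_val) * C)

magnitudes = torch.rand(1, 513, 100)  # (batch, filter_length // 2 + 1, n_frames)
compressed = dynamic_range_compression(magnitudes)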
Example #13
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """

    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax, num_workers,
                 use_multi_speaker, speaker_embedding_path, use_speaker_embedding_model):
        self.audio_files = files_to_list(training_files)

        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.num_workers = num_workers
        self.use_multi_speaker = use_multi_speaker
        self.speaker_embedding_path = speaker_embedding_path
        self.use_speaker_embedding_model = use_speaker_embedding_model
        if not self.use_speaker_embedding_model:
            self.spk_id_map = pickle.load(open(self.speaker_embedding_path, "rb"))

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_item(self, index):
        # Read audio
        filename = self.audio_files[index]
        # filename = os.path.join(self.npy_dir, os.path.basename(filename) + ".npy")
        filename = filename + ".npy"

        audio = np.load(filename)

        audio = torch.from_numpy(audio).float()

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(audio, (0, self.segment_length - audio.size(0)), 'constant').data
        mel = self.get_mel(audio)
        # todo: check whether get side effect to result quality
        audio = audio / MAX_WAV_VALUE

        if self.use_multi_speaker:
            if self.use_speaker_embedding_model:
                speaker_embedding_path = os.path.join(self.speaker_embedding_path,
                                                      os.path.basename(self.audio_files[index]) + ".npy")
                if not os.path.isfile(speaker_embedding_path):
                    print("nothing spk embed", speaker_embedding_path)
                    raise Exception("nothing spk embed", speaker_embedding_path)
                speaker_embedding = self.get_speaker_embedding(speaker_embedding_path)
            else:
                spk_file_name = os.path.splitext(os.path.basename(self.audio_files[index]))[0]
                if spk_file_name not in self.spk_id_map:
                    print("nothing spk embed id", spk_file_name)
                    raise Exception("nothing spk embed id", spk_file_name)
                speaker_embedding = self.spk_id_map[spk_file_name]

            return (mel, audio, speaker_embedding)
        else:
            return (mel, audio)

    def get_speaker_embedding(self, filename):
        speaker_embedding_np = np.load(filename)
        speaker_embedding_np = torch.autograd.Variable(torch.FloatTensor(speaker_embedding_np.astype(np.float32)),
                                                       requires_grad=False)
        # speaker_embedding_np = speaker_embedding_np.half() if self.is_fp16 else speaker_embedding_np
        return speaker_embedding_np

    def __getitem__(self, index):
        # Read audio
        while True:
            try:
                return self.get_item(index)
            except Exception:
                index = random.randint(0, len(self.audio_files) - 1)

    def __len__(self):
        return len(self.audio_files)
Example #14
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self,
                 training_files,
                 segment_length,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 mel_fmin,
                 mel_fmax,
                 debug=False):
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.debug = debug

        valid_files = []
        paths = files_to_list(training_files)
        for path in paths:
            dur = duration(path)
            if dur >= self.segment_length:
                valid_files.append(path)
        self.audio_files = valid_files

    def get_mel(self, audio):
        audio = audio.unsqueeze(0)
        audio = torch.autograd.Variable(audio, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]

        sampling_rate = sample_rate(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        if self.debug:
            print('Mel2Samp load: %d %s' % (index, filename))

        dur = duration(filename)

        # Take segment
        if dur >= self.segment_length:
            max_audio_start = dur - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = load_wav_to_torch(filename,
                                      start_sample=audio_start,
                                      end_sample=(audio_start +
                                                  self.segment_length))
        else:
            audio = load_wav_to_torch(filename, start_sample=0, end_sample=dur)
            audio = torch.nn.functional.pad(audio,
                                            (0, self.segment_length - dur),
                                            'constant').data

        mel = self.get_mel(audio)

        if self.debug:
            print('Mel2Samp done: %d %s' % (index, filename))

        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
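Example 14 avoids loading whole files up front by using `duration()` and `sample_rate()` helpers that are not shown. As a hedged sketch, equivalents can be built on the standard-library `wave` module for plain PCM WAV files (the project's real helpers may read headers differently or support other formats):

import wave

def duration(path):
    """Length of a PCM WAV file in samples."""
    with wave.open(path, 'rb') as wav_file:
        return wav_file.getnframes()

def sample_rate(path):
    """Sampling rate of a PCM WAV file in Hz."""
    with wave.open(path, 'rb') as wav_file:
        return wav_file.getframerate()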
Example #15
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self,
                 training_files,
                 validation_files,
                 validation_windows,
                 segment_length,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 mel_fmin,
                 mel_fmax,
                 load_mel_from_disk,
                 preempthasis,
                 check_files=False):
        self.audio_files = load_filepaths_and_text(training_files)

        if check_files:
            print("Files before checking: ", len(self.audio_files))
            if True:  # list comp non-verbose
                # filter audio files that don't exist
                self.audio_files = [
                    x for x in self.audio_files if os.path.exists(x[0])
                ]
                assert len(self.audio_files), "self.audio_files is empty"

                # filter spectrograms that don't exist
                if load_mel_from_disk > 0.0:
                    self.audio_files = [
                        x for x in self.audio_files if os.path.exists(x[1])
                    ]
                    assert len(self.audio_files), "self.audio_files is empty"

                # filter audio files that are too short
                self.audio_files = [
                    x for x in self.audio_files
                    if (os.stat(x[0]).st_size // 2) >= segment_length
                ]
                assert len(self.audio_files), "self.audio_files is empty"
            else:  # forloop with verbose support
                i = 0
                i_offset = 0
                for i_ in range(len(self.audio_files)):
                    i = i_ + i_offset
                    if i == len(self.audio_files): break
                    file = self.audio_files[i]

                    if not os.path.exists(
                            file[0]):  # check if audio file exists
                        print(f"'{file[0]}' does not exist")
                        self.audio_files.remove(file)
                        i_offset -= 1
                        continue

                    if load_mel_from_disk > 0.0 and not os.path.exists(
                            file[1]):  # check if mel exists
                        print(f"'{file[1]}' does not exist")
                        self.audio_files.remove(file)
                        i_offset -= 1
                        continue

                    if 1:  # performant mode if bitdepth is already known
                        bitdepth = 2
                        size = os.stat(file[0]).st_size
                        duration = size // bitdepth  #duration in samples
                        if duration <= segment_length:  # check if audio file is shorter than segment_length
                            #print(f"'{file[0]}' is too short")
                            self.audio_files.remove(file)
                            i_offset -= 1
                            continue
                    else:
                        audio_data, sample_r, *_ = load_wav_to_torch(file[0])
                        if audio_data.size(
                                0
                        ) <= segment_length:  # check if audio file is shorter than segment_length
                            print(f"'{file[0]}' is too short")
                            self.audio_files.remove(file)
                            i_offset -= 1
                            continue
            print("Files after checking: ", len(self.audio_files))

        self.load_mel_from_disk = load_mel_from_disk
        self.speaker_ids = self.create_speaker_lookup_table(self.audio_files)

        # Apply weighting to MLP Datasets
        duplicated_audiopaths = [
            x for x in self.audio_files if "SlicedDialogue" in x[0]
        ]
        for i in range(3):
            self.audio_files.extend(duplicated_audiopaths)

        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 n_mel_channels=160,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        if preempthasis:
            self.preempthasise = PreEmphasis(preempthasis)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.hop_length = hop_length
        self.win_length = win_length

    def create_speaker_lookup_table(self, audiopaths_and_text):
        speaker_ids = np.sort(np.unique([x[2] for x in audiopaths_and_text]))
        d = {int(speaker_ids[i]): i for i in range(len(speaker_ids))}
        return d

    def get_speaker_id(self, speaker_id):
        """Convert external speaker_id to internel [0 to max_speakers] range speaker_id"""
        return torch.IntTensor([self.speaker_ids[int(speaker_id)]])

    def get_mel(self, audio):
        """Take audio, normalize [-1 to 1] and convert to spectrogram"""
        audio_norm = audio / self.MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm).squeeze(0)
        return melspec

    def get_segment(self,
                    audio,
                    mel,
                    segment_length,
                    hop_length,
                    n_mel_channels=160):
        """get audio and mel segment from an already generated spectrogram and audio."""
        mel_segment_length = int(
            segment_length / hop_length) + 1  # 8400/600 + 1 = 15
        if audio.size(0) >= segment_length:
            max_mel_start = int(
                (audio.size(0) - segment_length) /
                hop_length) - 1  # mel.size(1) - mel_segment_length
            mel_start = random.randint(0, max_mel_start)
            audio_start = mel_start * hop_length
            audio = audio[audio_start:audio_start + segment_length]
            mel = mel[:, mel_start:mel_start + mel_segment_length]
        else:
            mel_start = 0
            n_mel_channels = 160  # TODO take from config file
            len_pad = int((segment_length / hop_length) - mel.shape[1])
            pad = np.ones(
                (n_mel_channels, len_pad), dtype=np.float32) * -11.512925
            mel = np.append(mel, pad, axis=1)
            audio = torch.nn.functional.pad(
                audio, (0, segment_length - audio.size(0)), 'constant').data
        return audio, mel, mel_start, mel_start + mel_segment_length

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate, max_value = load_wav_to_torch(filename[0])
        self.MAX_WAV_VALUE = max(
            max_value,
            audio.max().item(), -audio.min().item()
        )  # I'm not sure how, but sometimes the magnitude of audio exceeds the max of the datatype used before casting.
        assert audio.shape[
            0], f"Audio has 0 length.\nFile: {filename[0]}\nIndex: {index}"
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        if random.random(
        ) < self.load_mel_from_disk:  # load_mel_from_disk is now a probability instead of bool.
            # load mel from disk
            mel = np.load(filename[1])

            # offset the audio if the GTA spectrogram uses an offset
            if ".mel.npy" in filename[1] or (
                    ".mel" in filename[1] and ".npy" in filename[1]
                    and filename[1].split(".mel")[1].split(".npy")[0]):
                offset = int(filename[1].split(".mel")[1].split(".npy")[0])
                audio = audio[offset:]
                #print(f"DEBUG: audio offset success.\nPath = '{filename[1]}'\nOffset = {offset}")

            assert self.segment_length % self.hop_length == 0, 'self.segment_length must be a multiple of self.hop_length'

            # Take segment
            for i in range(20):
                audio_segment, mel_segment, start_step, stop_step = self.get_segment(
                    audio, mel, self.segment_length,
                    self.hop_length)  # get random segment of audio file
                if torch.std(audio_segment) > (
                        0.006103515625 * self.MAX_WAV_VALUE
                ):  # if sample is not silent, use sample for WaveGlow.
                    break
            else:
                print("No loud segments found, filename:", filename[0])
            audio, mel = audio_segment, mel_segment

            mel = torch.from_numpy(mel).float()
        else:
            # Take segment
            if audio.size(0) >= self.segment_length:
                max_audio_start = audio.size(0) - self.segment_length
                std = 9e9
                for i in range(20):
                    audio_start = random.randint(0, max_audio_start)
                    audio_segment = audio[audio_start:audio_start +
                                          self.segment_length]
                    if torch.std(audio_segment) > (0.006103515625 *
                                                   self.MAX_WAV_VALUE):
                        break  # if sample is not silent, use sample for WaveGlow.
                else:
                    print("No Loud Sample Found, filename:", filename[0])
                audio = audio_segment
            else:
                audio = torch.nn.functional.pad(
                    audio, (0, self.segment_length - audio.size(0)),
                    'constant').data
            assert audio.shape[
                0], f"Audio has 0 length.\nFile: {filename[0]}\nIndex: {index}"
            # generate mel from audio segment
            mel = self.get_mel(audio)

        # normalize audio [-1 to 1]
        audio = audio / self.MAX_WAV_VALUE

        # apply preempthasis to audio signal (if used)
        if hasattr(self, 'preempthasise'):
            audio = self.preempthasise(
                audio.unsqueeze(0).unsqueeze(0)).squeeze()

        speaker_id = self.get_speaker_id(filename[2])
        mel, audio, speaker_id = mel.contiguous(), audio.contiguous(
        ), speaker_id.contiguous()
        return (mel, audio, speaker_id)  # (mel, audio, speaker_id)

    def __len__(self):
        return len(self.audio_files)
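For reference, the silence threshold used above is easy to relate to the fixed `std > 250` check from Example 7: `0.006103515625` equals `200 / 32768`, so with 16-bit audio it corresponds to a standard deviation of about 200 raw sample units.

MAX_WAV_VALUE = 32768.0  # assumption: 16-bit audio
print(0.006103515625 * MAX_WAV_VALUE)  # 200.0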