def data_generator(data_base, chunk_length_in_sec, label_order_list, batch_size):
    stft = TacotronSTFT(**SFT_CONFIG)
    keys = data_base.get_db_keys()
    batch_x = []
    batch_y = []
    while True:
        for k in keys:
            label = np.array([label_order_list.index(k)])
            # Iterate over every recording of this key, not just the first one.
            for t in data_base.get_db_wv_times(k):
                sampling_rate, speech = data_base.get_wav(k, *t)
                chunks = int(len(speech) / sampling_rate / chunk_length_in_sec)
                audio_length = int(sampling_rate * chunk_length_in_sec)
                for chunk in range(chunks):
                    audio = speech[chunk * audio_length:(chunk + 1) * audio_length]
                    audio_norm = audio / MAX_WAV_VALUE
                    audio_norm = torch.from_numpy(audio_norm).float()
                    audio_norm = audio_norm.unsqueeze(0)
                    audio_norm = torch.autograd.Variable(audio_norm,
                                                         requires_grad=False)
                    melspec = stft.mel_spectrogram(audio_norm)
                    mel_np = melspec.detach().numpy()
                    # Zero-mean each mel channel within the chunk.
                    for i in range(mel_np.shape[1]):
                        channel_mean = np.mean(mel_np[0, i, :])
                        mel_np[0, i, :] = mel_np[0, i, :] - channel_mean
                    batch_x.append(mel_np)
                    batch_y.append(label)
                    if len(batch_x) >= batch_size:
                        x = torch.from_numpy(np.array(batch_x))
                        y = Variable(torch.from_numpy(
                            np.concatenate(batch_y)).long())
                        batch_x = []
                        batch_y = []
                        yield x, y
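# A minimal consumption sketch for the generator above; `db`, `SFT_CONFIG`,
# and the label names are hypothetical stand-ins, not from the source.
# Each yielded x stacks mean-normalized mel chunks; y holds the key indices.
gen = data_generator(db, chunk_length_in_sec=1.0,
                     label_order_list=["spk_a", "spk_b"], batch_size=32)
x, y = next(gen)  # x: (batch, 1, n_mels, frames) tensor, y: (batch,) long tensor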
class Mel2SampOnehot(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, mu_quantization,
                 filter_length, hop_length, win_length, sampling_rate,
                 mel_fmin, mel_fmax):
        audio_files = utils.files_to_list(training_files)
        self.audio_files = audio_files
        random.seed(1234)
        random.shuffle(self.audio_files)
        mel_fmax = None if mel_fmax == -1 else mel_fmax
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.mu_quantization = mu_quantization
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / utils.MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = utils.load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))
        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data
        mel = self.get_mel(audio)
        audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE,
                                    self.mu_quantization)
        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
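# `utils.mu_law_encode` is not shown here; the following is a self-contained
# sketch of standard mu-law companding, which that helper is assumed to
# implement: f(x) = sign(x) * log(1 + mu|x|) / log(1 + mu), then quantize.
import math
import torch

def mu_law_encode_sketch(audio, mu_quantization=256):
    """Map audio in [-1, 1] to integer bins in [0, mu_quantization - 1]."""
    mu = mu_quantization - 1.0
    magnitude = torch.log1p(mu * torch.abs(audio)) / math.log1p(mu)
    signal = torch.sign(audio) * magnitude  # companded, still in [-1, 1]
    return ((signal + 1) / 2 * mu + 0.5).long()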
class Tacotron(AudioProcessing):
    """Preprocesses audio as in the Tacotron2 code."""

    def __init__(self, sampling_rate, n_mel_channels, filter_length=1024,
                 hop_length=256, win_length=1024, mel_fmin=0.0,
                 mel_fmax=8000.0):
        super(Tacotron, self).__init__(sampling_rate, n_mel_channels)
        self.taco_stft = TacotronSTFT(
            filter_length=filter_length,
            hop_length=hop_length,
            win_length=win_length,
            sampling_rate=sampling_rate,
            n_mel_channels=n_mel_channels,
            mel_fmin=mel_fmin,
            mel_fmax=mel_fmax,
        )

    def audio_to_mel(self, audio):
        audio = torch.tensor(audio)
        audio = audio.unsqueeze(0)
        melspec = self.taco_stft.mel_spectrogram(audio)
        melspec = torch.squeeze(melspec, 0)
        return melspec.T

    def mel_to_audio(self, mel):
        # TODO make it work in batch mode
        mel = mel.unsqueeze(0)
        mel_decompress = self.taco_stft.spectral_de_normalize(mel)
        mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
        spec_from_mel_scaling = 1000
        spec_from_mel = torch.mm(mel_decompress[0], self.taco_stft.mel_basis)
        spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
        spec_from_mel = spec_from_mel * spec_from_mel_scaling
        GRIFFIN_ITERS = 60
        audio = griffin_lim(
            spec_from_mel[:, :, :-1],
            self.taco_stft.stft_fn,
            GRIFFIN_ITERS,
        )
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        return audio
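# Hypothetical round-trip sketch for the class above; the `AudioProcessing`
# base class and `griffin_lim` come from the surrounding codebase and are
# assumed importable. Parameter values are illustrative.
import numpy as np

proc = Tacotron(sampling_rate=22050, n_mel_channels=80)
wav = np.random.randn(22050).astype(np.float32)  # 1 s of noise as a stand-in
mel = proc.audio_to_mel(wav)      # (frames, n_mel_channels)
recon = proc.mel_to_audio(mel.T)  # Griffin-Lim reconstruction from the mel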
class Get_mel():
    def __init__(self, filter_length, hop_length, win_length, sampling_rate,
                 mel_fmin, mel_fmax):
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec
def data_generator(data_base, chunk_length_in_sec, label_order_list):
    stft = TacotronSTFT(**SFT_CONFIG)
    keys = data_base.get_db_keys()
    batch_train_x = []
    batch_train_y = []
    while True:
        for k in keys:
            label = np.array([label_order_list.index(k)])
            for t in data_base.get_db_wv_times(k):
                sampling_rate, speech = data_base.get_wav(k, *t)
                chunks = int(len(speech) / sampling_rate / chunk_length_in_sec)
                audio_length = int(sampling_rate * chunk_length_in_sec)
                for chunk in range(chunks):
                    audio = speech[chunk * audio_length:(chunk + 1) * audio_length]
                    audio_norm = audio / MAX_WAV_VALUE
                    audio_norm = torch.from_numpy(audio_norm).float()
                    audio_norm = audio_norm.unsqueeze(0)
                    audio_norm = torch.autograd.Variable(audio_norm,
                                                         requires_grad=False)
                    melspec = stft.mel_spectrogram(audio_norm)
                    mel_np = melspec.detach().numpy()
                    # Zero-mean each mel channel within the chunk.
                    for i in range(mel_np.shape[1]):
                        channel_mean = np.mean(mel_np[0, i, :])
                        mel_np[0, i, :] = mel_np[0, i, :] - channel_mean
                    batch_train_x.append(mel_np)
                    batch_train_y.append(label)
            # Yield everything accumulated for this key as one batch.
            if len(batch_train_x) > 0 and len(batch_train_y) > 0:
                yield np.array(batch_train_x), np.concatenate(batch_train_y)
            else:
                yield np.array([]), np.array([])
            batch_train_x = []
            batch_train_y = []
        # Signal the end of one full pass over the database.
        yield None, None
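# Epoch-style consumption sketch (hypothetical `db`, `labels`, `train_step`):
# the generator yields one batch per database key and (None, None) once per
# full pass over the keys, which the caller can treat as an epoch boundary.
gen = data_generator(db, chunk_length_in_sec=1.0, label_order_list=labels)
for x, y in gen:
    if x is None:  # epoch marker
        break
    train_step(x, y)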
class Mel2SampWaveglow(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, segment_length, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax):
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, filepath):
        audio, sr = load_wav_to_torch(filepath)
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, validation_files, validation_windows,
                 segment_length, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax, load_mel_from_disk,
                 preempthasis):
        self.audio_files = load_filepaths_and_text(training_files)
        print("Files before checking: ", len(self.audio_files))
        # Drop entries whose audio is missing or shorter than one segment.
        i = 0
        i_offset = 0
        for i_ in range(len(self.audio_files)):
            i = i_ + i_offset
            if i == len(self.audio_files):
                break
            file = self.audio_files[i]
            if not os.path.exists(file[0]):
                print(file[0], "does not exist")
                self.audio_files.remove(file)
                i_offset -= 1
                continue
            audio_data, sample_r = load_wav_to_torch(file[0])
            if audio_data.size(0) <= segment_length:
                print(file[0], "is too short")
                self.audio_files.remove(file)
                i_offset -= 1
                continue
        print("Files after checking: ", len(self.audio_files))
        self.load_mel_from_disk = load_mel_from_disk
        self.speaker_ids = self.create_speaker_lookup_table(self.audio_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 n_mel_channels=160,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        if preempthasis:
            self.preempthasise = PreEmphasis(preempthasis)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.hop_length = hop_length
        self.win_length = win_length

    def create_speaker_lookup_table(self, audiopaths_and_text):
        speaker_ids = np.sort(np.unique([x[2] for x in audiopaths_and_text]))
        d = {int(speaker_ids[i]): i for i in range(len(speaker_ids))}
        return d

    def get_speaker_id(self, speaker_id):
        return torch.IntTensor([self.speaker_ids[int(speaker_id)]])

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm).squeeze(0)
        return melspec

    def get_segment(self, audio, mel, segment_length, hop_length,
                    n_mel_channels=160):
        mel_segment_length = int(segment_length / hop_length)  # 8400/600 = 14
        if audio.size(0) >= segment_length:
            # audio.size(0) % hop_length is the remainder
            max_mel_start = int((audio.size(0) - segment_length) / hop_length)
            mel_start = random.randint(0, max_mel_start)
            audio_start = mel_start * hop_length
            audio = audio[audio_start:audio_start + segment_length]
            mel = mel[:, mel_start:mel_start + mel_segment_length]
        else:
            mel_start = 0
            n_mel_channels = 160  # TODO take from config file
            len_pad = int((segment_length / hop_length) - mel.shape[1])
            # Pad with log(1e-5) = -11.512925, i.e. mel-space silence.
            pad = np.ones(
                (n_mel_channels, len_pad), dtype=np.float32) * -11.512925
            mel = np.append(mel, pad, axis=1)
            audio = torch.nn.functional.pad(
                audio, (0, segment_length - audio.size(0)), 'constant').data
        return audio, mel, mel_start, mel_start + mel_segment_length

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename[0])
        assert audio.shape[0], \
            f"Audio has 0 length.\nFile: {filename[0]}\nIndex: {index}"
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))
        if self.load_mel_from_disk:
            # Take segment
            mel = np.load(filename[1])
            assert self.segment_length % self.hop_length == 0, \
                'self.segment_length must be a multiple of self.hop_length'
            # (disabled) sanity checks that the saved mel length matches
            # ceil(len(audio) / self.hop_length)
            # Retry until a non-silent segment is found (give up after 20).
            loop = 0
            while True:
                audio_, mel_, start_step, stop_step = self.get_segment(
                    audio, mel, self.segment_length,
                    self.hop_length)  # get random segment of audio file
                std = torch.std(audio_)
                if std > 250:
                    break  # sample is not silent, use it for WaveGlow
                loop += 1
                if loop > 20:
                    print("No loud sample found, filename:", filename[0])
                    break
            audio, mel = audio_, mel_
            mel = torch.from_numpy(mel).float()
        else:
            # Take segment
            if audio.size(0) >= self.segment_length:
                max_audio_start = audio.size(0) - self.segment_length
                loop = 0
                while True:
                    audio_start = random.randint(0, max_audio_start)
                    audio_segment = audio[audio_start:audio_start +
                                          self.segment_length]
                    std = torch.std(audio_segment)
                    if std > 250:
                        break  # sample is not silent, use it for WaveGlow
                    loop += 1
                    if loop > 20:
                        print("No loud sample found, filename:", filename[0])
                        break
                audio = audio_segment
            else:
                audio = torch.nn.functional.pad(
                    audio, (0, self.segment_length - audio.size(0)),
                    'constant').data
            assert audio.shape[0], \
                f"Audio has 0 length.\nFile: {filename[0]}\nIndex: {index}"
            mel = self.get_mel(audio)  # generate mel from audio segment
        audio = audio / MAX_WAV_VALUE
        if hasattr(self, 'preempthasise'):
            audio = self.preempthasise(
                audio.unsqueeze(0).unsqueeze(0)).squeeze()
        speaker_id = self.get_speaker_id(filename[2])
        #mel = (mel+5.2)*0.5 # shift values between approx -4 and 4
        return (mel, audio, speaker_id)

    def __len__(self):
        return len(self.audio_files)
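# The `std > 250` test above is an amplitude gate on raw int16-scaled samples
# (250 / 32768, about 0.76% of full scale). A standalone restatement:
import torch

MAX_WAV_VALUE = 32768.0  # int16 full scale, as assumed by the class above

def is_loud_enough(segment: torch.Tensor, threshold: float = 250.0) -> bool:
    """Reject near-silent segments by their sample standard deviation."""
    return torch.std(segment.float()).item() > threshold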
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        # Filter out audio shorter than segment_length (disabled):
        # i = 0
        # for file in files_to_list(training_files):
        #     audio_data, sample_r = load_wav_to_torch(file)
        #     if audio_data.size(0) < segment_length:
        #         i += 1
        #         print(file)
        #         self.audio_files.remove(file)
        # print("{} files shorter than segment_len".format(i))
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))
        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            print("Warning: short wav")
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data
        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE
        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
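# Minimal DataLoader wiring for the class above. The hyperparameters are
# illustrative values in the spirit of common WaveGlow configs, not taken
# from this source.
from torch.utils.data import DataLoader

dataset = Mel2Samp("train_files.txt", segment_length=16000,
                   filter_length=1024, hop_length=256, win_length=1024,
                   sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0)
loader = DataLoader(dataset, batch_size=8, shuffle=True,
                    num_workers=4, drop_last=True)
mel, audio = next(iter(loader))  # mel: (B, n_mels, T), audio: (B, 16000)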
class Mel2SampSplit(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.dataset = self.pack()

    def pack(self):
        timings = np.zeros(len(self.audio_files), dtype=np.int32)
        PAD = 350
        assert self.sampling_rate % PAD == 0
        for i, file in enumerate(self.audio_files):
            audio, sampling_rate = load_wav_to_torch(file)
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            t = audio.size(0)
            # Round each clip's length up to the next multiple of PAD.
            t2 = t + (-t) % PAD
            timings[i] = t2
        segment_len = self.sampling_rate
        total_time = timings.sum()
        n_data = int(total_time // segment_len)
        if total_time % segment_len != 0:
            n_data += 1
        # All data will be packed into this buffer of fixed-size segments.
        dataset = torch.zeros([n_data, segment_len], dtype=torch.float32)
        offset = 0
        cur = 0
        for i, file in enumerate(self.audio_files):
            audio, _ = load_wav_to_torch(file)
            audio = torch.nn.functional.pad(
                audio, (0, timings[i] - audio.size(0)), 'constant').data
            assert timings[i] == audio.size(0)
            data_left = audio.size(0)
            data_offset = 0
            space = segment_len - offset
            # Fill whole segments until less than one segment's worth remains.
            while data_left >= space:
                dataset[cur, offset:offset + space] = \
                    audio[data_offset:data_offset + space]
                data_left = data_left - space
                data_offset = data_offset + space
                offset = 0
                space = segment_len
                cur = cur + 1
            # Append what's left at the start of the next data segment.
            if data_left > 0:
                new_offset = offset + data_left
                dataset[cur, offset:new_offset] = audio[data_offset:]
                offset = new_offset
        return dataset

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        audio = self.dataset[index, :]
        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE
        return (mel, audio)

    def __len__(self):
        return self.dataset.size(0)
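# The padded-length computation in pack() is intended (per the assert on PAD)
# to round each clip up to the next multiple of PAD; the fix above assumes
# this intent. A standalone statement of the rounding rule:
def round_up(n: int, multiple: int) -> int:
    return n + (-n) % multiple

assert round_up(351, 350) == 700 and round_up(700, 350) == 700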
class Mel2Samp2(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.everything = self.pack()
        self.max_time = self.segment_length
        best = -1
        score = 0.0
        if self.max_time == 0:
            # Auto-configure max_time for maximum bin utilization.
            # do_binpacking() is expected to populate self.balancer (lists of
            # file indices per bin) and self.volumes (total samples per bin);
            # it is not defined in this snippet.
            for x in range(250000, 1000000, 10000):
                self.max_time = x
                self.do_binpacking()
                utilized = np.asarray(self.volumes).mean() / self.max_time
                if utilized > score:
                    score = utilized
                    best = x
            self.max_time = best
        self.do_binpacking()
        perm = list(range(len(self.balancer)))
        random.shuffle(perm)
        self.volumes = [self.volumes[p] for p in perm]
        self.balancer = [self.balancer[p] for p in perm]

    def pack(self):
        timings = []
        for file in self.audio_files:
            audio, sampling_rate = load_wav_to_torch(file)
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            timings.append(audio.size(0))
        return timings

    def get_timings(self):
        timings = []
        for file in self.audio_files:
            audio, sampling_rate = load_wav_to_torch(file)
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            timings.append(audio.size(0))
        return np.asarray(timings)

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio: one item is a bin of several clips, concatenated with
        # the slack spread evenly as padding between them.
        idxs = self.balancer[index]
        time = self.volumes[index]
        pad = (self.max_time - time) // max(len(idxs) - 1, 1)
        audios = []
        for k, idx in enumerate(idxs):
            filename = self.audio_files[idx]
            audio, sampling_rate = load_wav_to_torch(filename)
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            if k != len(idxs) - 1:
                audio = torch.nn.functional.pad(audio, (0, pad),
                                                'constant').data
            audios.append(audio)
        audio = torch.cat(audios)
        if audio.size(0) < self.max_time:
            audio = torch.nn.functional.pad(
                audio, (0, self.max_time - audio.size(0)), 'constant').data
        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE
        return (mel, audio)

    def __len__(self):
        return len(self.balancer)
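# `do_binpacking`, `self.balancer`, and `self.volumes` are not defined in the
# snippet above. A hypothetical first-fit-decreasing sketch of what they
# plausibly compute from per-file durations:
def first_fit_decreasing(timings, max_time):
    """Group clip indices into bins whose total duration stays <= max_time.

    Returns (balancer, volumes): lists of index-lists and their totals.
    """
    order = sorted(range(len(timings)), key=lambda i: -timings[i])
    balancer, volumes = [], []
    for idx in order:
        t = timings[idx]
        for b in range(len(balancer)):
            if volumes[b] + t <= max_time:
                balancer[b].append(idx)
                volumes[b] += t
                break
        else:
            balancer.append([idx])
            volumes.append(t)
    return balancer, volumes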
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax,
                 load_mel_from_disk=False):
        self.load_mel_from_disk = load_mel_from_disk
        self.hop_length = hop_length
        self.audio_files = audiopaths_and_melpaths(
            training_files) if self.load_mel_from_disk else files_to_list(
                training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_mel_from_file(self, mel_path):
        melspec = torch.from_numpy(np.load(mel_path))
        melspec = torch.autograd.Variable(melspec, requires_grad=False)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename[0]) \
            if self.load_mel_from_disk else load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))
        if self.load_mel_from_disk:
            # Take segment
            mel = np.load(filename[1])
            assert self.segment_length % self.hop_length == 0, \
                'self.segment_length must be a multiple of self.hop_length'
            max_mel_length = int(self.segment_length / self.hop_length)
            audio_ = audio.data.cpu().numpy()
            if mel.shape[1] > len(audio_) / self.hop_length:
                # Handle mismatch: trim mel frames beyond the audio length.
                diff = int(mel.shape[1] - len(audio_) / self.hop_length)
                mel = mel[:, :-diff]
            if mel.shape[1] < len(audio_) / self.hop_length:
                print(filename, mel.shape, len(audio))
            if audio.size(0) >= self.segment_length:
                # audio.size(0) % self.hop_length is the remainder
                max_mel_start = int(
                    (audio.size(0) - self.segment_length) / self.hop_length)
                mel_start = random.randint(0, max_mel_start)
                audio_start = mel_start * self.hop_length
                audio = audio[audio_start:audio_start + self.segment_length]
                mel = mel[:, mel_start:mel_start + max_mel_length]
            else:
                len_pad = int((self.segment_length / self.hop_length) -
                              mel.shape[1])
                # Pad with log(1e-5) = -11.512925, i.e. mel-space silence.
                pad = np.ones((80, len_pad), dtype=np.float32) * -11.512925
                mel = np.append(mel, pad, axis=1)
                audio = torch.nn.functional.pad(
                    audio, (0, self.segment_length - audio.size(0)),
                    'constant').data
            mel = torch.from_numpy(mel).float()
            audio = audio / MAX_WAV_VALUE
        else:
            # Take segment
            if audio.size(0) >= self.segment_length:
                max_audio_start = audio.size(0) - self.segment_length
                audio_start = random.randint(0, max_audio_start)
                audio = audio[audio_start:audio_start + self.segment_length]
            else:
                audio = torch.nn.functional.pad(
                    audio, (0, self.segment_length - audio.size(0)),
                    'constant').data
            mel = self.get_mel(audio)
            # audio = audio / MAX_WAV_VALUE
        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
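# The disk-loaded mel above is trimmed or padded so its frame count matches
# len(audio) / hop_length. A small sanity-check helper (illustrative only):
import math

def expected_frames(num_samples: int, hop_length: int) -> int:
    """Frame count the alignment logic above assumes for a saved mel."""
    return math.ceil(num_samples / hop_length)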
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, data_folder,
                 audio_format, return_stft=False):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.return_stft = return_stft
        if self.return_stft:
            self.stft = STFT(filter_length=filter_length,
                             hop_length=hop_length,
                             win_length=win_length)
        else:
            self.stft = TacotronSTFT(filter_length=filter_length,
                                     hop_length=hop_length,
                                     win_length=win_length,
                                     sampling_rate=sampling_rate,
                                     mel_fmin=0.0,
                                     mel_fmax=8000.0)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.data_folder = data_folder
        self.audio_format = audio_format

    def get_stft(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        if self.return_stft:
            magnitudes, phases = self.stft.transform(audio_norm)
            magnitudes = dynamic_range_compression(magnitudes)
            magnitudes = torch.squeeze(magnitudes, 0)
            return magnitudes
        else:
            melspec = self.stft.mel_spectrogram(audio_norm)
            melspec = torch.squeeze(melspec, 0)
            return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        filename = os.path.join(self.data_folder, filename)
        audio, sampling_rate = load_wav_to_torch(filename, self.audio_format)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))
        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data
            print('{} - NOT ENOUGH FRAMES'.format(filename))
        stft = self.get_stft(audio)
        audio = audio / MAX_WAV_VALUE
        return (stft, audio)

    def __len__(self):
        return len(self.audio_files)
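# `dynamic_range_compression` is not shown above; a sketch of the Tacotron2-
# style log-compression it is assumed to apply to the STFT magnitudes:
import torch

def dynamic_range_compression_sketch(x, C=1, clip_val=1e-5):
    """Clamp away zeros, then compress the dynamic range with a log."""
    return torch.log(torch.clamp(x, min=clip_val) * C)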
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax,
                 num_workers, use_multi_speaker, speaker_embedding_path,
                 use_speaker_embedding_model):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.num_workers = num_workers
        self.use_multi_speaker = use_multi_speaker
        self.speaker_embedding_path = speaker_embedding_path
        self.use_speaker_embedding_model = use_speaker_embedding_model
        if not self.use_speaker_embedding_model:
            self.spk_id_map = pickle.load(
                open(self.speaker_embedding_path, "rb"))

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_item(self, index):
        # Read audio from a pre-extracted .npy array next to the listed path.
        filename = self.audio_files[index]
        filename = filename + ".npy"
        audio = np.load(filename)
        audio = torch.from_numpy(audio).float()
        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data
        mel = self.get_mel(audio)
        # TODO: check whether this normalization affects result quality
        audio = audio / MAX_WAV_VALUE
        if self.use_multi_speaker:
            if self.use_speaker_embedding_model:
                speaker_embedding_path = os.path.join(
                    self.speaker_embedding_path,
                    os.path.basename(self.audio_files[index]) + ".npy")
                if not os.path.isfile(speaker_embedding_path):
                    print("nothing spk embed", speaker_embedding_path)
                    raise Exception("nothing spk embed",
                                    speaker_embedding_path)
                speaker_embedding = self.get_speaker_embedding(
                    speaker_embedding_path)
            else:
                spk_file_name = os.path.splitext(
                    os.path.basename(self.audio_files[index]))[0]
                if spk_file_name not in self.spk_id_map:
                    print("nothing spk embed id", spk_file_name)
                    raise Exception("nothing spk embed id", spk_file_name)
                speaker_embedding = self.spk_id_map[spk_file_name]
            return (mel, audio, speaker_embedding)
        else:
            return (mel, audio)

    def get_speaker_embedding(self, filename):
        speaker_embedding_np = np.load(filename)
        speaker_embedding_np = torch.autograd.Variable(
            torch.FloatTensor(speaker_embedding_np.astype(np.float32)),
            requires_grad=False)
        return speaker_embedding_np

    def __getitem__(self, index):
        # Read audio; retry with a random index if this item fails to load.
        while True:
            try:
                return self.get_item(index)
            except Exception:
                index = random.randint(0, len(self.audio_files) - 1)

    def __len__(self):
        return len(self.audio_files)
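# When use_speaker_embedding_model is false, speaker_embedding_path is read
# as a pickled dict mapping utterance basenames (without extension) to
# speaker ids. An illustrative builder for such a file (names hypothetical):
import pickle

spk_id_map = {"utt_0001": 0, "utt_0002": 1}
with open("spk_id_map.pkl", "wb") as f:
    pickle.dump(spk_id_map, f)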
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax,
                 debug=False):
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.debug = debug
        # Keep only files long enough to supply a full segment.
        valid_files = []
        paths = files_to_list(training_files)
        for path in paths:
            dur = duration(path)
            if dur >= self.segment_length:
                valid_files.append(path)
        self.audio_files = valid_files

    def get_mel(self, audio):
        audio = audio.unsqueeze(0)
        audio = torch.autograd.Variable(audio, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        sampling_rate = sample_rate(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))
        if self.debug:
            print('Mel2Samp load: %d %s' % (index, filename))
        dur = duration(filename)
        # Take segment; only the needed samples are read from disk.
        if dur >= self.segment_length:
            max_audio_start = dur - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = load_wav_to_torch(
                filename,
                start_sample=audio_start,
                end_sample=(audio_start + self.segment_length))
        else:
            audio = load_wav_to_torch(filename, start_sample=0,
                                      end_sample=dur)
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - dur), 'constant').data
        mel = self.get_mel(audio)
        if self.debug:
            print('Mel2Samp done: %d %s' % (index, filename))
        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
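# The `duration`, `sample_rate`, and partial-read `load_wav_to_torch` helpers
# are not shown. With the soundfile package they could plausibly look like
# this (an assumption, not the source's implementation):
import soundfile as sf
import torch

def sample_rate(path):
    return sf.info(path).samplerate

def duration(path):
    return sf.info(path).frames  # length in samples

def load_wav_to_torch(path, start_sample=0, end_sample=None):
    data, _ = sf.read(path, start=start_sample, stop=end_sample,
                      dtype='float32')
    return torch.from_numpy(data)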
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, validation_files, validation_windows,
                 segment_length, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax, load_mel_from_disk,
                 preempthasis, check_files=False):
        self.audio_files = load_filepaths_and_text(training_files)
        if check_files:
            print("Files before checking: ", len(self.audio_files))
            if True:  # list comp non-verbose
                # filter audio files that don't exist
                self.audio_files = [
                    x for x in self.audio_files if os.path.exists(x[0])
                ]
                assert len(self.audio_files), "self.audio_files is empty"
                # filter spectrograms that don't exist
                if load_mel_from_disk > 0.0:
                    self.audio_files = [
                        x for x in self.audio_files if os.path.exists(x[1])
                    ]
                    assert len(self.audio_files), "self.audio_files is empty"
                # filter audio files that are too short
                self.audio_files = [
                    x for x in self.audio_files
                    if (os.stat(x[0]).st_size // 2) >= segment_length
                ]
                assert len(self.audio_files), "self.audio_files is empty"
            else:  # forloop with verbose support
                i = 0
                i_offset = 0
                for i_ in range(len(self.audio_files)):
                    i = i_ + i_offset
                    if i == len(self.audio_files):
                        break
                    file = self.audio_files[i]
                    if not os.path.exists(file[0]):
                        # audio file does not exist
                        print(f"'{file[0]}' does not exist")
                        self.audio_files.remove(file)
                        i_offset -= 1
                        continue
                    if load_mel_from_disk > 0.0 and not os.path.exists(
                            file[1]):  # check if mel exists
                        print(f"'{file[1]}' does not exist")
                        self.audio_files.remove(file)
                        i_offset -= 1
                        continue
                    if True:  # performant mode if bitdepth is already known
                        bitdepth = 2
                        size = os.stat(file[0]).st_size
                        duration = size // bitdepth  # duration in samples
                        if duration <= segment_length:
                            # audio file is shorter than segment_length
                            self.audio_files.remove(file)
                            i_offset -= 1
                            continue
                    else:
                        audio_data, sample_r, *_ = load_wav_to_torch(file[0])
                        if audio_data.size(0) <= segment_length:
                            # audio file is shorter than segment_length
                            print(f"'{file[0]}' is too short")
                            self.audio_files.remove(file)
                            i_offset -= 1
                            continue
            print("Files after checking: ", len(self.audio_files))
        self.load_mel_from_disk = load_mel_from_disk
        self.speaker_ids = self.create_speaker_lookup_table(self.audio_files)
        # Apply weighting to MLP Datasets: SlicedDialogue entries are
        # duplicated three extra times.
        duplicated_audiopaths = [
            x for x in self.audio_files if "SlicedDialogue" in x[0]
        ]
        for i in range(3):
            self.audio_files.extend(duplicated_audiopaths)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 n_mel_channels=160,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        if preempthasis:
            self.preempthasise = PreEmphasis(preempthasis)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.hop_length = hop_length
        self.win_length = win_length

    def create_speaker_lookup_table(self, audiopaths_and_text):
        speaker_ids = np.sort(np.unique([x[2] for x in audiopaths_and_text]))
        d = {int(speaker_ids[i]): i for i in range(len(speaker_ids))}
        return d

    def get_speaker_id(self, speaker_id):
        """Convert external speaker_id to internal [0, max_speakers] range speaker_id."""
        return torch.IntTensor([self.speaker_ids[int(speaker_id)]])

    def get_mel(self, audio):
        """Take audio, normalize [-1 to 1] and convert to spectrogram."""
        audio_norm = audio / self.MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm).squeeze(0)
        return melspec

    def get_segment(self, audio, mel, segment_length, hop_length,
                    n_mel_channels=160):
        """Get audio and mel segment from an already generated spectrogram and audio."""
        mel_segment_length = int(segment_length / hop_length) + 1  # 8400/600 + 1 = 15
        if audio.size(0) >= segment_length:
            max_mel_start = int(
                (audio.size(0) - segment_length) / hop_length
            ) - 1  # mel.size(1) - mel_segment_length
            mel_start = random.randint(0, max_mel_start)
            audio_start = mel_start * hop_length
            audio = audio[audio_start:audio_start + segment_length]
            mel = mel[:, mel_start:mel_start + mel_segment_length]
        else:
            mel_start = 0
            n_mel_channels = 160  # TODO take from config file
            len_pad = int((segment_length / hop_length) - mel.shape[1])
            # Pad with log(1e-5) = -11.512925, i.e. mel-space silence.
            pad = np.ones(
                (n_mel_channels, len_pad), dtype=np.float32) * -11.512925
            mel = np.append(mel, pad, axis=1)
            audio = torch.nn.functional.pad(
                audio, (0, segment_length - audio.size(0)), 'constant').data
        return audio, mel, mel_start, mel_start + mel_segment_length

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate, max_value = load_wav_to_torch(filename[0])
        # I'm not sure how, but sometimes the magnitude of audio exceeds the
        # max of the datatype used before casting, so track the observed max.
        self.MAX_WAV_VALUE = max(max_value, audio.max().item(),
                                 -audio.min().item())
        assert audio.shape[0], \
            f"Audio has 0 length.\nFile: {filename[0]}\nIndex: {index}"
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))
        # load_mel_from_disk is now a probability instead of a bool.
        if random.random() < self.load_mel_from_disk:
            # load mel from disk
            mel = np.load(filename[1])
            # offset the audio if the GTA spectrogram uses an offset, encoded
            # in the filename as "<name>.mel<offset>.npy"; a plain
            # "<name>.mel.npy" carries no offset
            if ".mel" in filename[1] and ".npy" in filename[1]:
                offset_str = filename[1].split(".mel")[1].split(".npy")[0]
                if offset_str:
                    audio = audio[int(offset_str):]
            assert self.segment_length % self.hop_length == 0, \
                'self.segment_length must be a multiple of self.hop_length'
            # Take segment: retry until a non-silent segment is found.
            for i in range(20):
                audio_segment, mel_segment, start_step, stop_step = \
                    self.get_segment(audio, mel, self.segment_length,
                                     self.hop_length)
                if torch.std(audio_segment) > (0.006103515625 *
                                               self.MAX_WAV_VALUE):
                    break  # sample is not silent, use it for WaveGlow
            else:
                print("No loud segments found, filename:", filename[0])
            audio, mel = audio_segment, mel_segment
            mel = torch.from_numpy(mel).float()
        else:
            # Take segment
            if audio.size(0) >= self.segment_length:
                max_audio_start = audio.size(0) - self.segment_length
                for i in range(20):
                    audio_start = random.randint(0, max_audio_start)
                    audio_segment = audio[audio_start:audio_start +
                                          self.segment_length]
                    if torch.std(audio_segment) > (0.006103515625 *
                                                   self.MAX_WAV_VALUE):
                        break  # sample is not silent, use it for WaveGlow
                else:
                    print("No loud sample found, filename:", filename[0])
                audio = audio_segment
            else:
                audio = torch.nn.functional.pad(
                    audio, (0, self.segment_length - audio.size(0)),
                    'constant').data
            assert audio.shape[0], \
                f"Audio has 0 length.\nFile: {filename[0]}\nIndex: {index}"
            # generate mel from audio segment
            mel = self.get_mel(audio)
        # normalize audio [-1 to 1]
        audio = audio / self.MAX_WAV_VALUE
        # apply preemphasis to the audio signal (if used)
        if hasattr(self, 'preempthasise'):
            audio = self.preempthasise(
                audio.unsqueeze(0).unsqueeze(0)).squeeze()
        speaker_id = self.get_speaker_id(filename[2])
        mel, audio, speaker_id = (mel.contiguous(), audio.contiguous(),
                                  speaker_id.contiguous())
        return (mel, audio, speaker_id)

    def __len__(self):
        return len(self.audio_files)