def __getitem__(self, index):
    """Return one (mel, quantized-wav) training pair for the vocoder.

    Args:
        index: index into self.samples_fpaths, a sequence of
            (mel_path, wav_path) tuples of .npy files.

    Returns:
        mel:   float32 mel spectrogram scaled to [-1, 1] by hp.mel_max_abs_value.
        quant: int64 quantized waveform labels, trimmed/padded so that
               len(quant) == mel.shape[1] * hp.hop_length.

    Raises:
        ValueError: if hp.voc_mode is neither 'RAW' nor 'MOL'.
    """
    mel_path, wav_path = self.samples_fpaths[index]

    # Load the mel spectrogram and adjust its range to [-1, 1]
    mel = np.load(mel_path).T.astype(np.float32) / hp.mel_max_abs_value

    # Load the wav
    wav = np.load(wav_path)
    if hp.apply_preemphasis:
        wav = audio.pre_emphasis(wav)
    wav = np.clip(wav, -1, 1)

    # Fix for missing padding: right-pad the wav up to a whole number of hops.
    # TODO: settle on whether this is any useful
    r_pad = (len(wav) // hp.hop_length + 1) * hp.hop_length - len(wav)
    wav = np.pad(wav, (0, r_pad), mode='constant')
    assert len(wav) >= mel.shape[1] * hp.hop_length
    # Trim so the waveform exactly covers the mel frames.
    wav = wav[:mel.shape[1] * hp.hop_length]
    assert len(wav) % hp.hop_length == 0

    # Quantize the wav
    if hp.voc_mode == 'RAW':
        if hp.mu_law:
            quant = audio.encode_mu_law(wav, mu=2 ** hp.bits)
        else:
            quant = audio.float_2_label(wav, bits=hp.bits)
    elif hp.voc_mode == 'MOL':
        quant = audio.float_2_label(wav, bits=16)
    else:
        # Previously an unknown mode fell through to the return and raised a
        # confusing NameError on `quant`; fail fast with a clear message.
        raise ValueError("Unknown hp.voc_mode: %r (expected 'RAW' or 'MOL')" % hp.voc_mode)

    return mel.astype(np.float32), quant.astype(np.int64)
def __init__(self, npy_path=None, wav_path=None, speaker_id=0, speaker_nums=2,
             sample_frames=128, length=-1):
    """Dataset over a normalized mel spectrogram, from a .npy dump or a wav.

    Args:
        npy_path: path to a precomputed raw_data .npy (takes precedence).
        wav_path: path to a wav to encode into a mel spectrogram.
        speaker_id: which one-hot speaker slot to set (wrapped by speaker_nums).
        speaker_nums: size of the one-hot speaker vector.
        sample_frames: number of mel frames per sample window.
        length: dataset length override; if <= 0 a default is derived from
            the number of frames.

    Raises:
        ValueError: if neither npy_path nor wav_path is given.
    """
    super(SampleDataset, self).__init__()
    if npy_path is not None:
        self.raw_data = np.load(npy_path)
        print('Loading ', npy_path, "\tshape:", self.raw_data.shape)
    elif wav_path is not None:
        print('Encoding ', wav_path)
        wav, sr = librosa.load(wav_path, hparams.sample_rate)
        wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
        # Normalize with ~10% headroom so mu-law encoding never clips.
        wav = wav / (np.abs(wav).max() * 1.1)
        self.wav = audio.encode_mu_law(wav, mu=2 ** hparams.bits)
        mel_basis = librosa.filters.mel(hparams.sample_rate, hparams.n_fft,
                                        n_mels=hparams.num_mels)
        linear_spec = np.abs(
            librosa.stft(wav, n_fft=hparams.n_fft, hop_length=hparams.hop_size,
                         win_length=hparams.win_size))
        mel_spec = mel_basis.dot(linear_spec)
        # Floor before the log to avoid -inf and runtime warnings on silent
        # frames; the floor maps far below -120 dB, so after the clip to
        # [0, 1] below the output is unchanged.
        mel_db = 20 * np.log10(np.maximum(mel_spec, 1e-10))
        # print(in_fpath, mel_db.min(), mel_db.max())
        self.raw_data = np.clip((mel_db + 120) / 125, 0, 1)
        print('Raw_Data Shape:', self.raw_data.shape)
        # (num_mels, num_frames)
    else:
        # Previously this branch only printed and execution continued,
        # crashing later with an opaque AttributeError on self.raw_data.
        raise ValueError("Error! No data input: provide npy_path or wav_path")
    # One-hot speaker embedding; wrap the id so it always fits the vector.
    self.speaker = np.zeros(speaker_nums)
    self.speaker[speaker_id % speaker_nums] = 1
    self.sample_frames = sample_frames
    if length > 0:
        self.length = length
    else:
        self.length = max(self.raw_data.shape[1] // sample_frames, 50 * 32)
def __init__(self, video_path, speaker_id=0, speaker_nums=2, sample_frames=128,
             length=-1, ret_wav=False, use_256=False):
    """Dataset pairing a video's jpg frames with its audio mel spectrogram.

    The video is decoded once into per-frame jpgs (20 fps) next to the video
    file; subsequent constructions reuse the extracted folder.

    Args:
        video_path: path to the input video (audio is loaded from it too).
        speaker_id: which one-hot speaker slot to set (wrapped by speaker_nums).
        speaker_nums: size of the one-hot speaker vector.
        sample_frames: number of mel frames per sample window.
        length: dataset length override; if <= 0 a default is derived from
            the number of frames, rounded down to multiples of 8 and 16.
        ret_wav: if True, also keep a mu-law-encoded copy of the waveform.
        use_256: if True, the large transform resizes to 256, else to 512.
    """
    super(SampleVideoDataset, self).__init__()

    # Extract the video into individual jpg frames (done once, then reused).
    data_path, video_name = os.path.split(video_path)
    folder_path = data_path + '/' + video_name.split('.')[0] + '/'
    if not os.path.exists(folder_path):
        print("--- Creating %s... ---" % folder_path)
        os.makedirs(folder_path)
        reader = imageio.get_reader(video_path, 'ffmpeg', fps=20)
        for i, im in enumerate(reader):
            imageio.imwrite(folder_path + str(i).zfill(5) + '.jpg', im)
        print("--- OK ---")
    else:
        print("--- %s already exists! ---" % folder_path)
    self.list_frame = glob.glob(folder_path + '*.jpg')
    self.list_frame.sort()
    print("--- Totally %d video frames ---" % len(self.list_frame))

    # print(int(hparams.hop_size / hparams.sample_rate * self.sample_frames * 20))
    print('Encoding Audio of ', video_path)
    wav, sr = librosa.load(video_path, hparams.sample_rate)
    if ret_wav:
        self.wav = audio.encode_mu_law(wav, mu=2 ** hparams.bits)
    mel_basis = librosa.filters.mel(hparams.sample_rate, hparams.n_fft,
                                    n_mels=hparams.num_mels)
    linear_spec = np.abs(
        librosa.stft(wav, n_fft=hparams.n_fft, hop_length=hparams.hop_size,
                     win_length=hparams.win_size))
    mel_spec = mel_basis.dot(linear_spec)
    # Floor before the log to avoid -inf and runtime warnings on silent
    # frames; values below -120 dB clip to 0 either way, so the clipped
    # output is unchanged.
    mel_db = 20 * np.log10(np.maximum(mel_spec, 1e-10))
    # print(in_fpath, mel_db.min(), mel_db.max())
    self.raw_data = np.clip((mel_db + 120) / 125, 0, 1)
    print('Raw_Data Shape:', self.raw_data.shape)
    if np.isnan(self.raw_data).any():
        print('!!!There exists np.nan in raw_data!!!')
    # (num_mels, num_frames)

    # One-hot speaker vector; wrap the id like SampleDataset does, instead of
    # risking an IndexError when speaker_id >= speaker_nums.
    self.speaker = np.zeros(speaker_nums)
    self.speaker[speaker_id % speaker_nums] = 1
    self.sample_frames = sample_frames
    if length > 0:
        self.length = length
    else:
        self.length = (max(self.raw_data.shape[1] // sample_frames, 50 * 32) // 8) * 8
    # NOTE(review): in the original this rounding appears to apply to both
    # branches (it makes the //8 rounding above redundant) — confirm intent.
    self.length = self.length // 16 * 16
    self.ret_wav = ret_wav
    self.transform = transforms.Compose(
        [transforms.Resize(128),
         transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    if use_256:
        self.transform_large = transforms.Compose(
            [transforms.Resize(256),
             transforms.ToTensor(),
             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    else:
        self.transform_large = transforms.Compose(
            [transforms.Resize(512),
             transforms.ToTensor(),
             transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    self.use_256 = use_256