示例#1
0
    def __getitem__(self, index):
        """Return one (mel spectrogram, quantized waveform) training pair.

        Args:
            index: Position in ``self.samples_fpaths``; each entry is a
                ``(mel_path, wav_path)`` pair of files readable by ``np.load``.

        Returns:
            A tuple ``(mel, quant)``. ``mel`` is the loaded spectrogram,
            transposed and divided by ``hp.mel_max_abs_value``, as float32.
            ``quant`` is the quantized waveform as int64, holding exactly
            ``mel.shape[1] * hp.hop_length`` samples.

        Raises:
            ValueError: If ``hp.voc_mode`` is neither ``'RAW'`` nor ``'MOL'``.
            AssertionError: If the padded waveform is shorter than
                ``mel.shape[1] * hp.hop_length`` samples.
        """
        mel_path, wav_path = self.samples_fpaths[index]

        # Load the mel spectrogram and adjust its range to [-1, 1]
        mel = np.load(mel_path).T.astype(np.float32) / hp.mel_max_abs_value

        # Load the wav
        wav = np.load(wav_path)
        if hp.apply_preemphasis:
            wav = audio.pre_emphasis(wav)
        wav = np.clip(wav, -1, 1)

        # Fix for missing padding   # TODO: settle on whether this is any useful
        # Pads with zeros up to the next multiple of hop_length (a whole extra
        # hop when the length is already aligned), then trims the waveform to
        # the number of samples covered by the mel frames.
        r_pad = (len(wav) // hp.hop_length + 1) * hp.hop_length - len(wav)
        wav = np.pad(wav, (0, r_pad), mode='constant')
        n_samples = mel.shape[1] * hp.hop_length
        assert len(wav) >= n_samples
        wav = wav[:n_samples]
        assert len(wav) % hp.hop_length == 0

        # Quantize the wav
        if hp.voc_mode == 'RAW':
            if hp.mu_law:
                quant = audio.encode_mu_law(wav, mu=2**hp.bits)
            else:
                quant = audio.float_2_label(wav, bits=hp.bits)
        elif hp.voc_mode == 'MOL':
            quant = audio.float_2_label(wav, bits=16)
        else:
            # Without this branch `quant` would be unbound and the return
            # below would fail with an unhelpful UnboundLocalError.
            raise ValueError(
                "Unsupported hp.voc_mode %r; expected 'RAW' or 'MOL'" % (hp.voc_mode,))

        return mel.astype(np.float32), quant.astype(np.int64)
    def __init__(self, npy_path=None, wav_path=None, speaker_id=0, speaker_nums=2, sample_frames=128, length=-1):
        """Build the dataset from a precomputed array or from an audio file.

        Args:
            npy_path: Path to a file readable by ``np.load``; its content is
                used as ``self.raw_data`` unchanged. Takes precedence over
                ``wav_path`` when both are given.
            wav_path: Path to an audio file. It is loaded with librosa at
                ``hparams.sample_rate``, pre-emphasized, peak-normalized and
                converted to a normalized mel spectrogram in ``[0, 1]``; the
                mu-law encoded waveform is stored in ``self.wav``.
            speaker_id: Speaker index; taken modulo ``speaker_nums`` to pick
                the hot entry of the one-hot vector ``self.speaker``.
            speaker_nums: Length of the one-hot speaker vector.
            sample_frames: Number of frames per sample; stored and used to
                derive the default dataset length.
            length: Dataset length. When not positive, it defaults to
                ``max(raw_data.shape[1] // sample_frames, 50 * 32)``.

        Raises:
            ValueError: If neither ``npy_path`` nor ``wav_path`` is given.
        """
        super(SampleDataset, self).__init__()
        if npy_path is not None:
            self.raw_data = np.load(npy_path)
            print('Loading ', npy_path, "\tshape:", self.raw_data.shape)

        elif wav_path is not None:
            print('Encoding ', wav_path)
            # `sr` and `n_fft` are keyword-only in librosa >= 0.10; passing
            # them by keyword works on older releases as well.
            wav, _ = librosa.load(wav_path, sr=hparams.sample_rate)
            wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
            # Scale the peak to 1 / 1.1. Skip all-zero audio, which would
            # otherwise turn into NaN through 0 / 0.
            peak = np.abs(wav).max()
            if peak > 0:
                wav = wav / (peak * 1.1)
            self.wav = audio.encode_mu_law(wav, mu=2 ** hparams.bits)

            mel_basis = librosa.filters.mel(
                sr=hparams.sample_rate, n_fft=hparams.n_fft, n_mels=hparams.num_mels)
            linear_spec = np.abs(
                librosa.stft(wav, n_fft=hparams.n_fft, hop_length=hparams.hop_size, win_length=hparams.win_size))
            mel_spec = mel_basis.dot(linear_spec)
            mel_db = 20 * np.log10(mel_spec)
            # Map mel_db from [-120, 5] onto [0, 1]; anything outside
            # (including -inf from zero-energy bins) is clipped.
            self.raw_data = np.clip((mel_db + 120) / 125, 0, 1)
            print('Raw_Data Shape:', self.raw_data.shape)
            # (num_mels, num_frames)
        else:
            # Fail here instead of leaving `self.raw_data` undefined and
            # crashing later with an AttributeError.
            raise ValueError("Error! No data input...")

        # One-hot speaker encoding.
        self.speaker = np.zeros(speaker_nums)
        self.speaker[speaker_id % speaker_nums] = 1
        self.sample_frames = sample_frames
        if length > 0:
            self.length = length
        else:
            self.length = max(self.raw_data.shape[1] // sample_frames, 50 * 32)
    def __init__(self, video_path, speaker_id=0, speaker_nums=2, sample_frames=128, length=-1, ret_wav=False, use_256=False):
        """Build the dataset from a video file: cached frames plus audio mel.

        Frames are extracted once (at 20 fps) as ``00000.jpg``, ``00001.jpg``,
        ... into a folder located next to the video and named after the part
        of the file name before its first dot; an existing folder is reused
        as-is. The audio track is loaded with librosa at
        ``hparams.sample_rate`` and converted to a normalized mel spectrogram
        in ``[0, 1]`` stored in ``self.raw_data``.

        Args:
            video_path: Path to the video file.
            speaker_id: Speaker index; taken modulo ``speaker_nums`` to pick
                the hot entry of the one-hot vector ``self.speaker``.
            speaker_nums: Length of the one-hot speaker vector.
            sample_frames: Number of frames per sample; stored and used to
                derive the default dataset length.
            length: Dataset length. When not positive, it defaults to
                ``max(raw_data.shape[1] // sample_frames, 50 * 32)`` rounded
                down to a multiple of 8. In every case the final value is
                rounded down to a multiple of 16.
            ret_wav: When true, the mu-law encoded waveform is also kept in
                ``self.wav``.
            use_256: Selects the size given to ``Resize`` in
                ``self.transform_large``: 256 when true, 512 otherwise.
        """
        super(SampleVideoDataset, self).__init__()
        data_path, video_name = os.path.split(video_path)
        # A bare file name has an empty directory part; prefixing '/' in that
        # case would place the frame folder at the filesystem root.
        prefix = data_path + '/' if data_path else ''
        folder_path = prefix + video_name.split('.')[0] + '/'

        if not os.path.exists(folder_path):
            print("---  Creating %s...  ---" % folder_path)
            os.makedirs(folder_path)
            # Context manager makes sure the reader is closed once the
            # frames are dumped, even if writing one of them fails.
            with imageio.get_reader(video_path, 'ffmpeg', fps=20) as reader:
                for i, im in enumerate(reader):
                    imageio.imwrite(folder_path + str(i).zfill(5) + '.jpg', im)
            print("---  OK  ---")
        else:
            print("---  %s already exists!  ---" % folder_path)

        self.list_frame = sorted(glob.glob(folder_path + '*.jpg'))
        print("--- Totally %d video frames ---" % len(self.list_frame))

        print('Encoding Audio of ', video_path)
        # `sr` and `n_fft` are keyword-only in librosa >= 0.10; passing them
        # by keyword works on older releases as well.
        wav, _ = librosa.load(video_path, sr=hparams.sample_rate)
        if ret_wav:
            self.wav = audio.encode_mu_law(wav, mu=2 ** hparams.bits)
        mel_basis = librosa.filters.mel(
            sr=hparams.sample_rate, n_fft=hparams.n_fft, n_mels=hparams.num_mels)
        linear_spec = np.abs(
            librosa.stft(wav, n_fft=hparams.n_fft, hop_length=hparams.hop_size, win_length=hparams.win_size))
        mel_spec = mel_basis.dot(linear_spec)
        mel_db = 20 * np.log10(mel_spec)
        # Map mel_db from [-120, 5] onto [0, 1]; anything outside (including
        # -inf from zero-energy bins) is clipped.
        self.raw_data = np.clip((mel_db + 120) / 125, 0, 1)
        print('Raw_Data Shape:', self.raw_data.shape)
        if np.isnan(self.raw_data).any():
            print('!!!There exists np.nan in raw_data!!!')
        # (num_mels, num_frames)

        # One-hot speaker encoding; the modulo keeps the index in range.
        self.speaker = np.zeros(speaker_nums)
        self.speaker[speaker_id % speaker_nums] = 1
        self.sample_frames = sample_frames
        if length > 0:
            self.length = length
        else:
            self.length = (max(self.raw_data.shape[1] // sample_frames, 50 * 32) // 8) * 8
        # Applied to explicit lengths too, so a length below 16 becomes 0.
        self.length = self.length // 16 * 16
        self.ret_wav = ret_wav

        def build_transform(size):
            # Resize, convert to tensor, then normalize each of the three
            # channels with mean 0.5 and std 0.5.
            return transforms.Compose([transforms.Resize(size),
                                       transforms.ToTensor(),
                                       transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

        self.transform = build_transform(128)
        self.transform_large = build_transform(256 if use_256 else 512)
        self.use_256 = use_256