def __call__(self, original, sample_rate):
    """Mix randomly selected background noise into a waveform

    Noise chunks are drawn from random 'gaps' segments of random files in
    `self.files_` until their cumulated length covers `original`. Each chunk
    is normalized, the chunks are stacked, and the result is added to the
    normalized original with a gain derived from a random SNR.

    Parameters
    ----------
    original : `np.ndarray`
        (n_samples, n_channels) waveform.
    sample_rate : `int`
        Sample rate.

    Returns
    -------
    augmented : `np.ndarray`
        (n_samples, n_channels) noise-augmented waveform.
    """

    # noise is loaded as mono, at the sample rate of the original waveform
    loader = RawAudio(sample_rate=sample_rate, mono=True)

    # gather noise chunks until no sample is left to cover
    chunks = []
    remaining = len(original)
    while remaining > 0:

        # pick a noise file, then one of its 'gaps' segments, at random
        noise_file = random.choice(self.files_)
        gap = next(random_segment(noise_file['gaps'], weighted=False))
        chunk_duration = gap.duration

        # the segment holds more samples than still needed: only keep a
        # random sub-segment lasting the missing duration
        if chunk_duration * sample_rate > remaining:
            chunk_duration = remaining / sample_rate
            gap = next(random_subsegment(gap, chunk_duration))

        chunk = loader.crop(noise_file, gap,
                            mode='center', fixed=chunk_duration)

        # progress is measured with the number of samples actually returned
        remaining -= len(chunk)
        chunks.append(normalize(chunk))

    # stack chunks one after the other
    # FIXME: use fade-in between concatenated noises
    stacked = np.vstack(chunks)

    # draw SNR uniformly between `snr_min` and `snr_max`
    snr_span = self.snr_max - self.snr_min
    snr = snr_span * np.random.random_sample() + self.snr_min

    # noise gain, equal to 10 ** (-snr / 20)
    gain = np.exp(-np.log(10) * snr / 20)

    return normalize(original) + gain * stacked
def __call__(self, original, sample_rate):
    """Augment original waveform

    Noise files are picked at random from `self.files_` and stacked until
    they cover the original waveform. The stacked noise is then added to the
    normalized original with a gain derived from a random SNR.

    Progress is counted in samples actually loaded (not in seconds read from
    file['duration']), and the stacked noise is trimmed to `len(original)`,
    so that the final addition always operates on arrays of equal length.

    Parameters
    ----------
    original : `np.ndarray`
        (n_samples, n_channels) waveform.
    sample_rate : `int`
        Sample rate.

    Returns
    -------
    augmented : `np.ndarray`
        (n_samples, n_channels) noise-augmented waveform.
    """

    # noise is loaded as mono, at the sample rate of the original waveform
    raw_audio = RawAudio(sample_rate=sample_rate, mono=True)

    n_samples = len(original)

    # accumulate enough noise to cover the original waveform
    noises = []
    remaining = n_samples
    while remaining > 0:

        # select noise file at random
        file = random.choice(self.files_)
        duration = file['duration']

        # duration (in seconds) still missing
        needed = remaining / sample_rate

        # if noise file is longer than what is needed, crop it at random
        if duration > needed:
            segment = next(random_subsegment(Segment(0, duration), needed))
            noise = raw_audio.crop(file, segment,
                                   mode='center', fixed=needed)

        # otherwise, take the whole file
        else:
            noise = raw_audio(file).data

        # rely on the number of samples actually obtained: rounding or an
        # inaccurate file['duration'] may make it differ from what the
        # durations suggest
        remaining -= len(noise)

        noise = normalize(noise)
        noises.append(noise)

    # concatenate, and drop any excess samples so that noise and original
    # waveform have the same length
    # FIXME: use fade-in between concatenated noises
    noise = np.vstack(noises)[:n_samples]

    # select SNR at random (uniformly between `snr_min` and `snr_max`)
    snr = (self.snr_max - self.snr_min) * np.random.random_sample() \
        + self.snr_min

    # noise gain, equal to 10 ** (-snr / 20)
    alpha = np.exp(-np.log(10) * snr / 20)

    return normalize(original) + alpha * noise
def _create_config(self, segment_size_sec: float):
    """Build the speaker validation configuration

    Parameters
    ----------
    segment_size_sec : `float`
        Value forwarded as the `duration` of the configuration.

    Returns
    -------
    config : `metrics.SpeakerValidationConfig`
        Configuration for the 'VoxCeleb.SpeakerVerification.VoxCeleb2'
        protocol, using raw audio at `self.sample_rate` as feature
        extraction and a `FileFinder` as 'audio' preprocessor.
    """
    extractor = RawAudio(sample_rate=self.sample_rate)
    file_preprocessors = {'audio': FileFinder()}
    return metrics.SpeakerValidationConfig(
        protocol_name='VoxCeleb.SpeakerVerification.VoxCeleb2',
        feature_extraction=extractor,
        preprocessors=file_preprocessors,
        duration=segment_size_sec,
    )