def apply_augmentation(audio_path: str, preproc_cfg: dict, logger: Logger)\
        -> Tuple[np.ndarray, int]:
    """
    Loads the audio at `audio_path` and applies the augmentations enabled in
    `preproc_cfg`. Each enabled augmentation is applied with its configured
    probability via a single Bernoulli draw.

    Arguments:
        audio_path - str: path to the audio file
        preproc_cfg - dict: preprocessing config; the keys read here are
            'tempo_gain_pitch_perturb', 'tempo_gain_pitch_prob', 'tempo_range',
            'gain_range', 'pitch_range', 'augment_from_normal',
            'synthetic_gaussian_noise', 'gauss_noise_prob', 'gauss_snr_db_range',
            'background_noise', 'background_noise_prob', 'background_noise_dir',
            and 'background_noise_range'
        logger - Logger: logger that receives the progress messages
    Returns:
        tuple(np.ndarray, int): the (possibly augmented) audio data and the
            sample rate reported by the loader
    """
    logger.info(f"audio_path: {audio_path}")

    # sox-based perturbation loads the audio itself, so the plain loader is
    # only needed when the perturbation is disabled or not drawn this call
    apply_sox_perturb = (
        preproc_cfg['tempo_gain_pitch_perturb']
        and np.random.binomial(1, preproc_cfg['tempo_gain_pitch_prob'])
    )
    if apply_sox_perturb:
        aug_data, samp_rate = tempo_gain_pitch_perturb(
            audio_path,
            tempo_range=preproc_cfg['tempo_range'],
            gain_range=preproc_cfg['gain_range'],
            pitch_range=preproc_cfg['pitch_range'],
            augment_from_normal=preproc_cfg['augment_from_normal'],
            logger=logger)
    else:
        aug_data, samp_rate = array_from_wave(audio_path)

    if (preproc_cfg['synthetic_gaussian_noise']
            and np.random.binomial(1, preproc_cfg['gauss_noise_prob'])):
        aug_data = synthetic_gaussian_noise_inject(
            aug_data,
            preproc_cfg['gauss_snr_db_range'],
            preproc_cfg['augment_from_normal'],
            logger=logger)

    if preproc_cfg['background_noise']:
        if np.random.binomial(1, preproc_cfg['background_noise_prob']):
            logger.info("noise injected")
            aug_data = inject_noise(aug_data,
                                    samp_rate,
                                    preproc_cfg['background_noise_dir'],
                                    preproc_cfg['background_noise_range'],
                                    preproc_cfg['augment_from_normal'],
                                    logger)
        else:
            logger.info("noise not injected")

    return aug_data, samp_rate
def test_for_nan_values():
    """
    Runs spec_augment over every pairing of the nan-prone test audio files and
    the fixed parameter sets, asserting that the augmented features never
    contain a nan value.
    """
    logging.basicConfig(filename=None, filemode='w', level=logging.DEBUG)
    logger = logging.getLogger("train_log")

    nan_audio_paths = get_nan_audio()
    nan_param_sets = get_nan_parameters()

    for audio_idx, wav_path in enumerate(nan_audio_paths):
        for params_idx, fixed in enumerate(nan_param_sets):
            samples, rate = array_from_wave(wav_path)
            spec = log_spectrogram_from_data(samples,
                                             rate,
                                             window_size=32,
                                             step_size=16)
            # spec_augment is fed the transposed spectrogram as a torch tensor
            augmented = spec_augment(
                torch.from_numpy(spec.T),
                time_warping_para=fixed["W"],
                frequency_masking_para=fixed["frequency_masking_para"],
                time_masking_para=fixed["time_masking_para"],
                frequency_mask_num=len(fixed["f"]),
                time_mask_num=len(fixed["t"]),
                logger=logger,
                fixed_params=fixed)
            augmented = to_numpy(augmented).T

            # np.isnan yields a boolean array; any True entry makes the sum non-zero
            nan_total = np.isnan(augmented).sum()
            assert nan_total == 0, \
                f"nan value found in audio {audio_idx}, params {params_idx}"
def test_time_masking(logger: Logger = None):
    """
    Verifies that the number of time-masked rows never exceeds the largest
    amount the parameters allow (mask width * number of masks).

    Each parameter tuple is ordered as:
        ('time_warping_para', 'frequency_masking_para', 'time_masking_para',
         'frequency_mask_num', 'time_mask_num')
    """
    param_sets = [
        (0, 0, 60, 0, 1),  # 1 mask with max width of 60
        (0, 0, 30, 0, 2),
        (0, 0, 20, 0, 3),
    ]
    wav_paths = get_all_test_audio()
    repeats = 10  # mask placement is random, so repeat the check several times

    for _ in range(repeats):
        for wav_path in wav_paths:
            for params in param_sets:
                _, _, mask_width, _, mask_count = params

                samples, rate = array_from_wave(wav_path)
                spec = log_spectrogram_from_data(samples,
                                                 rate,
                                                 window_size=32,
                                                 step_size=16)
                masked = spec_augment(torch.from_numpy(spec.T), *params)
                masked_rows = count_time_mask(to_numpy(masked))

                assert masked_rows <= mask_width * mask_count
def check_audio_with_sox():
    """
    Tries to find noise files on which audio_with_sox fails by sliding windows
    of several lengths (see `data_lens`, in seconds) across every noise file.

    Any failure is re-raised with the offending file and window length in the
    message, chained to the original error so its traceback is not lost.

    Raises:
        AssertionError, FileNotFoundError, Exception: same category as the
            error raised by audio_with_sox, annotated with file and data_len
    """
    noise_dataset = dataset_info.NoiseDataset()
    noise_files = noise_dataset.files_from_pattern()
    data_lens = [0.5, 5, 50]  # in secs
    step_size = 0.05

    print(
        f"\n Test Full Noise File: testing {len(noise_files)} noise files...")

    for file_count, noise_file in enumerate(noise_files):
        print(f"Processing file {file_count}: {noise_file}")
        audio, samp_rate = array_from_wave(noise_file)
        noise_len = audio.shape[0] / samp_rate  # duration in seconds

        for data_len in data_lens:
            start_end_tups = calc_start_end(noise_len, data_len, step_size)
            for noise_start, noise_end in start_end_tups:
                failure_context = f"noise:{noise_file}, data_len: {data_len}"
                try:
                    audio_with_sox(noise_file, samp_rate, noise_start,
                                   noise_end)
                except AssertionError as err:
                    raise AssertionError(failure_context) from err
                except FileNotFoundError as err:
                    raise FileNotFoundError(failure_context) from err
                # `Exception` rather than a bare except, so that Ctrl-C and
                # SystemExit are not disguised as test failures
                except Exception as err:
                    raise Exception(failure_context) from err
def test_gain_pitch_same_size():
    """
    Checks that changing only the gain and/or the pitch (tempo fixed at 1.0)
    leaves the number of audio samples unchanged.
    """
    fixed_tempo = 1.0
    cases = [
        (0, 0),  # no augmentation
        (8, 0),  # only gain aug
        (0, 400),  # only pitch
        (-6, -400),  # both gain and pitch
    ]
    wav_path = get_all_test_audio()[0]  # a single audio path is enough here

    for gain_db, pitch_cents in cases:
        # reference: the un-augmented audio
        original, rate = array_from_wave(wav_path)
        perturbed, rate = tempo_gain_pitch_perturb(
            wav_path,
            sample_rate=rate,
            tempo_range=(fixed_tempo, fixed_tempo),
            gain_range=(gain_db, gain_db),
            pitch_range=(pitch_cents, pitch_cents))
        assert original.size == perturbed.size, "data size is not the same"
def extend_audio(audio_dir: str, target_duration: int) -> None:
    """
    Stacks each audio file in audio_dir on itself until it is target_duration
    seconds long, and writes the result to an "extended" sub-directory as
    "<name>_extended.wav". Files already at least target_duration long are
    left alone.

    Arguments:
        audio_dir (str): directory of audio files (only "*.wav" is considered)
        target_duration (int): length in seconds the audio files will be extended to
    Raises:
        AssertionError: if audio_dir does not exist
    """
    assert os.path.exists(audio_dir), "audio directory does not exist"

    pattern = os.path.join(audio_dir, "*.wav")
    for audio_fn in glob.glob(pattern):
        audio_duration = wav_duration(audio_fn)
        # zero-length audio cannot be tiled (divmod would divide by zero) and
        # audio that is already long enough needs no extension
        if not 0 < audio_duration < target_duration:
            continue

        data, samp_rate = array_from_wave(audio_fn)

        # whole_dup: number of complete copies that fit in the target
        # frac_dup: leftover duration (in seconds) to fill with a partial copy
        whole_dup, frac_dup = divmod(target_duration, audio_duration)
        fraction_index = int(frac_dup * samp_rate)

        # a single concatenate avoids re-copying the growing array every pass
        pieces = [data] * int(whole_dup)
        pieces.append(data[:fraction_index])
        output_data = np.concatenate(pieces, axis=0)

        base_name = os.path.splitext(os.path.basename(audio_fn))[0]
        extended_dir = os.path.join(os.path.dirname(audio_fn), "extended")
        os.makedirs(extended_dir, exist_ok=True)

        ext_audio_path = os.path.join(extended_dir, base_name + "_extended.wav")
        write(ext_audio_path, samp_rate, output_data)
def compare_log_spec_from_file(audio_file_1: str, audio_file_2: str, plot=False):
    """
    Computes a spectrogram for each of the two audio files (first channel only)
    and returns the element-wise difference: spectrogram_1 - spectrogram_2.

    Arguments:
        audio_file_1 - str: path to the first audio file
        audio_file_2 - str: path to the second audio file
        plot - bool: if true, the differences are passed to plot_spectrogram
    Returns:
        the difference of the two spectrogram arrays
    """
    window_size = 20
    step_size = 10

    def first_channel(samples):
        # multi-channel audio is reduced to its first channel
        return samples[:, 0] if len(samples.shape) > 1 else samples

    def spectrogram_of(samples, rate):
        # sizes are derived from the sample rate with a 1e3 divisor,
        # i.e. window_size/step_size are treated as milliseconds
        return scipy.signal.spectrogram(samples,
                                        fs=rate,
                                        window='hann',
                                        nperseg=int(window_size * rate / 1e3),
                                        noverlap=int(step_size * rate / 1e3),
                                        detrend=False)

    audio_1, sr_1 = array_from_wave(audio_file_1)
    audio_2, sr_2 = array_from_wave(audio_file_2)

    freq_1, time_1, spec_1 = spectrogram_of(first_channel(audio_1), sr_1)
    freq_2, time_2, spec_2 = spectrogram_of(first_channel(audio_2), sr_2)

    spec_diff = spec_1 - spec_2
    freq_diff = freq_1 - freq_2
    time_diff = time_1 - time_2

    if plot:
        plot_spectrogram(freq_diff, time_diff, spec_diff)

    return spec_diff
def test_datatype():
    """
    Checks that synthetic_gaussian_noise_inject returns int16 data for every
    test audio file.
    """
    snr_db = 30
    for wav_path in get_all_test_audio():
        samples, _ = array_from_wave(wav_path)
        noisy = synthetic_gaussian_noise_inject(samples,
                                                snr_range=(snr_db, snr_db))
        assert noisy.dtype == "int16"
def audio_with_sox(path: str, sample_rate: int, start_time: float, end_time: float, logger=None)\
        -> np.ndarray:
    """
    Crops and resamples the recording with sox and loads the result.
    If the output file cannot be found, an array of zeros of the desired
    length is returned instead.

    Arguments:
        path - str: path to the audio file to crop
        sample_rate - int: sample rate passed to sox for the output
        start_time - float: start of the crop, passed to sox `trim`
        end_time - float: absolute end of the crop, passed to sox `trim` as `=end_time`
        logger: optional logger; when None nothing is logged
    Returns:
        np.ndarray: the cropped audio data
    Raises:
        AssertionError: if the loaded data is not a numpy array
    """
    use_log = (logger is not None)

    with NamedTemporaryFile(suffix=".wav") as tar_file:
        tar_filename = tar_file.name
        sox_cmd = [
            'sox',
            '-V3',                      # verbosity level=3
            path,                       # noise filename
            '-r', f'{sample_rate}',     # sample rate
            '-c', '1',                  # output is single-channel audio
            '-b', '16',                 # bitrate = 16
            '-e', 'si',                 # encoding = signed-integer
            '-t', 'wav',                # the output file is wav type
            tar_filename,               # output temp-filename
            'trim', f'{start_time}', f'={end_time}',  # trim to start and end time
        ]
        sox_result = subprocess.run(sox_cmd,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)

        if use_log:
            logger.info(
                f"noise_inj: sox: sox stdout: {sox_result.stdout.decode('utf-8')}"
            )
            stderr_message = sox_result.stderr.decode('utf-8')
            if 'FAIL' in stderr_message:
                logger.error(f"noise_inj: sox: sox stderr: {stderr_message}")
                print(f"ERROR: noise_inj: sox: sox stderr: {stderr_message}")
            else:
                logger.info(f"noise_inj: sox: sox stderr: {stderr_message}")

        if os.path.exists(tar_filename):
            noise_data, _ = array_from_wave(tar_filename)
        else:
            # duration (seconds) * sample rate (samples/second) = sample count
            noise_len = round((end_time - start_time) * sample_rate)
            noise_data = np.zeros((noise_len, ))
            # the logger is optional, so it must be guarded here as well
            if use_log:
                logger.error(
                    f"noise_inj: sox: tmp_file doesnt exist, using zeros of len {noise_len}"
                )
            print(
                f"ERROR: noise_inj: sox: sox stderr: tmp_file doesnt exist, using zeros of len {noise_len}"
            )

    assert isinstance(noise_data, np.ndarray), "not numpy array returned"
    return noise_data
def augment_audio_with_sox(path: str,
                           sample_rate: int,
                           tempo: float,
                           gain: float,
                           pitch: float,
                           logger=None) -> Tuple[np.ndarray, int]:
    """
    Runs sox to change the tempo, gain (volume), and pitch of a recording,
    writing to a temporary wav file, and loads the result.

    Arguments:
        path - str: file to augment
        sample_rate - int: sample rate passed to sox for the output
        tempo - float: value for the sox `tempo` effect
        gain - float: value for the sox `gain` effect (in db)
        pitch - float: value for the sox `pitch` effect (in hundredths of a semi-tone)
        logger: optional logger; when None nothing is logged
    Returns:
        tuple(np.ndarray, int): the augmented audio data and its sample rate
    """
    with NamedTemporaryFile(suffix=".wav") as tmp_wav:
        augmented_filename = tmp_wav.name

        output_format = [
            '-r', f'{sample_rate}',  # sample rate
            '-c', '1',               # single-channel audio
            '-b', '16',              # bitrate = 16
            '-e', 'si',              # encoding = signed-integer
            '-t', 'wav',             # the output file is wav type
        ]
        effects = [
            'tempo', f'{tempo:.3f}',
            'gain', f'{gain:.3f}',
            'pitch', f'{pitch:.0f}',
        ]
        # layout: sox -V3 <input> <output format> <output file> <effects>
        sox_cmd = ['sox', '-V3', path, *output_format, augmented_filename, *effects]

        sox_result = subprocess.run(sox_cmd,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)

        if logger is not None:
            logger.info(
                f"sox_pertrub: aug_audio_sox: tmpfile exists: {os.path.exists(augmented_filename)}"
            )
            logger.info(
                f"sox_pertrub: aug_audio_sox: sox stdout: {sox_result.stdout.decode('utf-8')}"
            )
            stderr_message = sox_result.stderr.decode('utf-8')
            # stderr goes to the error level only when sox reports a FAIL
            report = logger.error if 'FAIL' in stderr_message else logger.info
            report(f"sox_pertrub: aug_audio_sox: sox stderr: {stderr_message}")

        # the temp file is removed on leaving the `with`, so load it here
        data, samp_rate = array_from_wave(augmented_filename)

    return data, samp_rate
def load_audio(audio_path, preproc):
    """
    Loads an audio file, computes its log-spectrogram (window_size=32,
    step_size=16) and normalizes it with the preprocessor's mean and std.

    Returns:
        the normalized features and the sample rate of the audio
    """
    samples, sample_rate = wave.array_from_wave(audio_path)
    spec = loader.log_specgram_from_data(samples,
                                         sample_rate,
                                         window_size=32,
                                         step_size=16)
    normalized = (spec - preproc.mean) / preproc.std
    return normalized, sample_rate
def tempo_gain_pitch_perturb(audio_path: str,
                             sample_rate: int = 16000,
                             tempo_range: AugmentRange = (0.85, 1.15),
                             gain_range: AugmentRange = (-6.0, 8.0),
                             pitch_range: AugmentRange = (-400, 400),
                             augment_from_normal: bool = False,
                             logger=None) -> Tuple[np.ndarray, int]:
    """
    Draws a tempo, gain, and pitch value from the given ranges and applies them
    to the utterance with the sox utility. If the sox augmentation raises a
    RuntimeError, the un-augmented audio is loaded and returned instead.

    Arguments:
        audio_path - str: path to the audio file
        sample_rate - int: sample rate handed to sox
        tempo_range, gain_range, pitch_range: ranges the values are drawn from
        augment_from_normal - bool: if true, the values come from
            get_value_from_truncnorm centered on the middle of each range;
            otherwise they are drawn uniformly
        logger: optional logger; when None nothing is logged
    Returns:
        tuple(np.ndarray, int) - the augmented audio data and the sample_rate
    """
    ranges = (tempo_range, gain_range, pitch_range)
    if augment_from_normal:
        drawn = [
            get_value_from_truncnorm(np.mean(bounds), bounds, bounds=bounds)
            for bounds in ranges
        ]
    else:
        drawn = [np.random.uniform(*bounds) for bounds in ranges]
    tempo_value, gain_value, pitch_value = drawn

    if logger is not None:
        logger.info(f"tempo_gain_pitch_perturb: audio_file: {audio_path}")
        logger.info(f"tempo_gain_pitch_perturb: tempo_value: {tempo_value}")
        logger.info(f"tempo_gain_pitch_perturb: gain_value: {gain_value}")
        logger.info(f"tempo_gain_pitch_perturb: pitch_value: {pitch_value}")

    try:
        audio_data, samp_rate = augment_audio_with_sox(audio_path,
                                                       sample_rate,
                                                       tempo_value,
                                                       gain_value,
                                                       pitch_value,
                                                       logger=logger)
    except RuntimeError as rterr:
        if logger is not None:
            logger.error(f"tempo_gain_pitch_perturb: RuntimeError: {rterr}")
        # fall back to the original, un-augmented audio
        audio_data, samp_rate = array_from_wave(audio_path)

    return audio_data, samp_rate
def resample_with_sox(path, sample_rate):
    """
    Resamples the recording with sox into a temporary single-channel, 16-bit
    signed-integer wav file and loads the result.

    The previous body referenced an output filename and trim times that were
    never defined in this function, so it could not run. The output now goes
    to a temporary file and no trimming is applied.

    Arguments:
        path: path to the audio file to resample
        sample_rate: sample rate passed to sox for the output
    Returns:
        the resampled audio data as loaded by array_from_wave
    """
    # local imports keep this block self-contained
    import subprocess
    from tempfile import NamedTemporaryFile

    with NamedTemporaryFile(suffix=".wav") as tar_file:
        tar_filename = tar_file.name
        # argument list (no shell) so unusual characters in `path` are safe
        sox_cmd = [
            'sox',
            path,                    # input filename
            '-r', f'{sample_rate}',  # sample rate
            '-c', '1',               # single-channel audio
            '-b', '16',              # bitrate = 16
            '-e', 'si',              # encoding = signed-integer
            '-t', 'wav',             # the output file is wav type
            tar_filename,            # output temp-filename
        ]
        # sox output is discarded, matching the original ">/dev/null 2>&1"
        subprocess.run(sox_cmd,
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
        # the temp file is removed on leaving the `with`, so load it here
        noise_data, _ = array_from_wave(tar_filename)

    return noise_data
def signal_augmentations(self, wave_file: str) -> tuple:
    """
    Loads the audio and applies the raw-signal augmentations that are enabled
    on this object. Augmentations only run when `self.train_status` is truthy,
    and each one is applied with its configured probability.
    The original notes state the audio data is in pcm16 format.

    Arguments:
        wave_file - str: the path to the audio sample
    Returns:
        audio_data - np.ndarray: augmented np-array
        samp_rate - int: sample rate of the audio recording
    """
    if self.use_log:
        self.logger.info(f"preproc: audio_data read: {wave_file}")

    audio_data, samp_rate = array_from_wave(wave_file)

    # sox-based tempo, gain, pitch augmentations
    if (self.tempo_gain_pitch_perturb and self.train_status
            and np.random.binomial(1, self.tempo_gain_pitch_prob)):
        audio_data, samp_rate = tempo_gain_pitch_perturb(
            wave_file,
            samp_rate,
            self.tempo_range,
            self.gain_range,
            self.pitch_range,
            self.augment_from_normal,
            logger=self.logger)
        if self.use_log:
            self.logger.info(f"preproc: tempo_gain_pitch applied")

    # synthetic gaussian noise
    if (self.synthetic_gaussian_noise and self.train_status
            and np.random.binomial(1, self.gauss_noise_prob)):
        audio_data = synthetic_gaussian_noise_inject(
            audio_data,
            self.gauss_snr_db_range,
            self.augment_from_normal,
            logger=self.logger)
        if self.use_log:
            self.logger.info(f"preproc: synth_gauss_noise applied")

    # background noise injection
    if (self.background_noise and self.train_status
            and np.random.binomial(1, self.background_noise_prob)):
        audio_data = inject_noise(audio_data,
                                  samp_rate,
                                  self.noise_dir,
                                  self.background_noise_range,
                                  self.augment_from_normal,
                                  self.logger)
        if self.use_log:
            self.logger.info(f"preproc: noise injected")

    return audio_data, samp_rate
def test_apply_spec_augment_call(logger: Logger = None):
    """
    Smoke test: only checks that apply_spec_augment can be called on the
    log-spectrogram of every test audio file without raising.

    Arguments:
        logger - Logger: optional logger forwarded to apply_spec_augment
    """
    for wav_path in get_all_test_audio():
        samples, rate = array_from_wave(wav_path)
        spec = log_spectrogram_from_data(samples,
                                         rate,
                                         window_size=32,
                                         step_size=16)
        apply_spec_augment(spec, logger)
def test_high_snr_value():
    """
    With a very high snr (100), the noise-injected audio should be nearly
    identical to the input audio for every test file.
    """
    snr_db = 100
    # absolute tolerance is 1e-5 of the range of values in pcm16 format (2**16)
    abs_tolerance = 2**16 * 1e-5

    for wav_path in get_all_test_audio():
        samples, _ = array_from_wave(wav_path)
        noisy = synthetic_gaussian_noise_inject(samples,
                                                snr_range=(snr_db, snr_db))
        np.testing.assert_allclose(samples,
                                   noisy,
                                   rtol=1e-03,
                                   atol=abs_tolerance)
def test_regression_equal_pickle():
    """
    Regression test: the stored pickle is the output for the Speak-out.wav file
    with an snr_level of 30 and a random seed of zero. The freshly computed
    output must match it element for element.
    """
    audio_path = "../test_audio/Speak-out.wav"
    pickle_path = "../test_pickle/sythentic-gaussian-noise-inject_Speak-out_snr-30.pickle"
    snr_db = 30

    with open(pickle_path, 'rb') as fid:
        expected = pickle.load(fid)

    samples, _ = array_from_wave(audio_path)

    # seed right before the call so the noise draw is reproducible
    np.random.seed(0)
    augmented_data = synthetic_gaussian_noise_inject(samples,
                                                     snr_range=(snr_db, snr_db))

    matching = (augmented_data == expected).sum()
    assert matching == augmented_data.size, "regression test fails"
def test_no_augment():
    """
    With neutral settings (tempo=1.0, gain=0, pitch=0) the output of
    tempo_gain_pitch_perturb must be identical to the input audio.
    """
    neutral_tempo, neutral_gain, neutral_pitch = 1.0, 0.0, 0.0

    for wav_path in get_all_test_audio():
        # reference: the un-augmented audio
        original, rate = array_from_wave(wav_path)
        perturbed, rate = tempo_gain_pitch_perturb(
            wav_path,
            sample_rate=rate,
            tempo_range=(neutral_tempo, neutral_tempo),
            gain_range=(neutral_gain, neutral_gain),
            pitch_range=(neutral_pitch, neutral_pitch))
        assert all(original == perturbed), "data is not the same"
def test_gain_increase_amplitude():
    """
    Checks that a gain change scales the rms amplitude by the expected ratio:
        1) a 6 dB increase corresponds to a ratio of 1.995
        2) a 6 dB decrease corresponds to a ratio of 0.501
    where ratio = 10**(gain/20). The rms of the augmented audio divided by the
    ratio is compared against the rms of the original audio.
    """
    fixed_tempo = 1.0
    fixed_pitch = 0.0
    cases = [
        (0, 1.0),  # no augmentation
        (6, 1.995),
        (-6, 0.501),
    ]
    # (10, 3.162) and (-10, 0.3162) were left out by the original author
    # because they fail; the reason was not determined.

    for wav_path in get_all_test_audio():
        print(f"audio_path: {wav_path}")
        for gain_db, amp_ratio in cases:
            # reference: the un-augmented audio
            original, rate = array_from_wave(wav_path)
            perturbed, rate = tempo_gain_pitch_perturb(
                wav_path,
                sample_rate=rate,
                tempo_range=(fixed_tempo, fixed_tempo),
                gain_range=(gain_db, gain_db),
                pitch_range=(fixed_pitch, fixed_pitch))

            audio_rms = audioop.rms(original, 2)
            scaled_aug_rms = audioop.rms(perturbed, 2) / amp_ratio

            accuracy = -1  # same up to 10^(-accuracy)
            print(
                f"audio rms: {audio_rms}, scaled_aug rms: {scaled_aug_rms}, ratio:{amp_ratio}, accuracy:{10**(-accuracy)}"
            )
            np.testing.assert_almost_equal(audio_rms,
                                           scaled_aug_rms,
                                           decimal=accuracy)
def test_tempo_augment():
    """
    Checks that the size of the tempo-augmented data multiplied by the tempo
    approximately equals the size of the un-augmented data.
    """
    # NOTE(review): a tempo of 0 is included in the cases below -- confirm
    # that sox handles it as intended
    tempo_values = [0, 0.5, 0.85, 1, 1.15, 2]

    for wav_path in get_all_test_audio():
        # reference: the un-augmented audio
        original, rate = array_from_wave(wav_path)
        for tempo in tempo_values:
            perturbed, rate = tempo_gain_pitch_perturb(
                wav_path,
                sample_rate=rate,
                tempo_range=(tempo, tempo),
                gain_range=(0, 0),
                pitch_range=(0, 0))
            print(
                f"audio_data size: {original.size}, aug_data: {perturbed.size}, tempo: {tempo}"
            )
            expected_size = pytest.approx(perturbed.size * tempo, 1e-1)
            assert original.size == expected_size
def compute_mean_std(
        audio_files: List[str], preprocessor: str, window_size: int,
        step_size: int,
        use_feature_normalize: bool) -> Tuple[np.ndarray, np.ndarray]:
    """
    Computes the per-bin mean and std deviation of the features of all the
    given audio files (frequency bins for the log_spec preprocessor). Each
    file's features are normalized first if use_feature_normalize is true.

    Args:
        audio_files - List[str]: a list of shuffled audio files. len = max_samples
        preprocessor (str): name of preprocessor
        window_size - int: window_size of preprocessor
        step_size - int: step_size of preprocessor
        use_feature_normalize - bool: whether or not the features themselves are normalized
    Returns:
        mean - np.ndarray: the mean of the feature bins - shape = (# feature bins,)
        std - np.ndarray: the std deviation of the feature bins - shape = (# bins,)
    """
    assert len(audio_files) > 0, "input list of audio_files is empty"

    def features_of(audio_file):
        # features for a single file, optionally normalized
        audio_data, samp_rate = array_from_wave(audio_file)
        feats = process_audio(audio_data, samp_rate, window_size, step_size,
                              preprocessor)
        return feature_normalize(feats) if use_feature_normalize else feats

    # each feature array is time x feature bin; stacking along the time axis
    # gives shape = (all_time, feature bin)
    stacked = np.vstack([features_of(audio_file) for audio_file in audio_files])

    # statistics along the time axis: shape = (feature bin,)
    mean = np.mean(stacked, axis=0, dtype='float32')
    std = np.std(stacked, axis=0, dtype='float32')
    return mean, std
def process_audio(audio,
                  samp_rate: int,
                  window_size=32,
                  step_size=16,
                  processing='log_spectrogram'):
    """Processes audio through the provided processing function.

    Args:
        audio (str or np.ndarray): path to audio or audio array
        samp_rate (int): sample rate of audio. Overwritten by the rate read
            from the file when `audio` is a path.
        window_size (int): size of window in processing function
        step_size (int): step in processing function
        processing (str): name of processing function. 'log_spectrogram', 'mfcc',
            and 'log_mel' are acceptable.
    Returns:
        np.ndarray: processed array of dimensions: time x processor_bins
    Raises:
        AssertionError: if `audio` is neither a str nor an np.ndarray
        ValueError: if `processing` is not one of the accepted names
    """
    assert isinstance(audio, (str, np.ndarray)), \
        f"audio must be type str or np.ndarray, not {type(audio)}"

    # a path was given: load the audio (and its sample rate) from that path
    if isinstance(audio, str):
        audio, samp_rate = array_from_wave(audio)

    audio = average_channels(audio)

    if processing == 'log_spectrogram':
        output = log_spectrogram(audio, samp_rate, window_size, step_size)
    elif processing == 'mfcc':
        output = mfcc(audio, samp_rate, window_size, step_size)
    elif processing == 'log_mel':
        output = log_mel_filterbank(audio, samp_rate, window_size, step_size)
    else:
        raise ValueError(f"processing value: {processing} is unacceptable")

    return output
def check_length(audio_path: str, noise_path: str, noise_level: float = 0.5):
    """
    Loads the audio at audio_path and runs inject_noise_sample on it with the
    noise file at noise_path. Nothing is returned; the call itself is the check.
    """
    samples, rate = array_from_wave(audio_path)
    audio_noise = inject_noise_sample(samples,
                                      rate,
                                      noise_path,
                                      noise_level=noise_level,
                                      logger=None)
def log_specgram_from_file(audio_file):
    """
    Loads the audio file and returns the result of log_specgram on its samples.
    """
    samples, sample_rate = wave.array_from_wave(audio_file)
    spec = log_specgram(samples, sample_rate)
    return spec
def log_specgram_from_file(audio_file, plot=False):
    """
    Loads the audio file and returns the result of log_specgram on its samples.
    The `plot` flag is forwarded to log_specgram.
    """
    samples, sample_rate = wave.array_from_wave(audio_file)
    spec = log_specgram(samples, sample_rate, plot=plot)
    return spec
def test_load():
    """
    Checks that "test0.wav" loads with a 16000 sample rate and int16 samples.
    """
    samples, sample_rate = wave.array_from_wave("test0.wav")
    assert sample_rate == 16000
    assert samples.dtype == np.int16