def test_loadsound_librosa_mp3():
    '''Load the MP3 fixture with `use_scipy=False` and check its first five
    samples and its sample rate.
    '''
    signal, rate = sp.loadsound(test_mp3, use_scipy=False, remove_dc=False)
    first_five = np.array([0.0, -1.5258789e-05, 0.0, 0.0, 0.0])
    # hint printed ahead of the assertions so it shows up in a failure report
    print('\nIF ERROR: could be due to update in Librosa from 0.7.2 to 0.8.0')
    assert np.allclose(signal[:5], first_five)
    assert rate == 44100
def test_loadsound_librosa_flac():
    '''Load the FLAC fixture with `use_scipy=False` and check its first five
    samples and its sample rate.
    '''
    signal, rate = sp.loadsound(test_flac, use_scipy=False, remove_dc=False)
    first_five = np.array([0.0, 0.0, 0.0, 0.0, -3.0517578e-05])
    assert np.allclose(signal[:5], first_five)
    assert rate == 44100
def test_resample_audio_sr22050_to_16000():
    '''Resample one second of audio from 22050 Hz to 16000 Hz and check that
    the returned sample rate and number of samples follow the new rate.
    '''
    sr_start, sr_target = 22050, 16000
    audio_1sec, sr = sp.loadsound(test_audiofile, dur_sec=1, sr=sr_start)
    # one second of audio: as many samples as the sample rate
    assert sr == sr_start
    assert len(audio_1sec) == sr_start
    resampled, sr_out = sp.dsp.resample_audio(audio_1sec,
                                              sr_original=sr,
                                              sr_desired=sr_target)
    assert sr_out == sr_target
    assert len(resampled) == sr_target
def test_loadsound_librosa_aiff_sr16000():
    '''Load the AIFF fixture at a requested sample rate of 16000 Hz with
    `use_scipy=False` and check its first five samples and sample rate.
    '''
    requested_sr = 16000
    signal, rate = sp.loadsound(test_aiff,
                                sr=requested_sr,
                                use_scipy=False,
                                remove_dc=False)
    first_five = np.array(
        [0.05152914, 0.03653815, -0.0083929, -0.0207656, -0.03038501])
    assert np.allclose(signal[:5], first_five)
    assert rate == requested_sr
def test_savesound_filename_wav2flac():
    '''Save a mono WAV signal under a `.flac` filename and check that the
    written file has the FLAC suffix and the FLAC container format.

    The written file is removed even if an assertion fails, and the
    `SoundFile` handle is closed before the file is deleted.
    '''
    y, sr = sp.loadsound(test_wav_mono)
    f = sp.utils.string2pathlib(test_wav_mono)
    format_type = 'FLAC'
    audiofile_new = example_dir.joinpath(f.stem + '.' + format_type.lower())
    audiofile_corrected = sp.savesound(audiofile_new, y, sr)
    try:
        # context manager closes the handle; an open handle can block
        # deletion of the file on some platforms
        with sf.SoundFile(audiofile_corrected) as soundobject:
            saved_format = soundobject.format
        assert audiofile_corrected.suffix[1:].lower() == format_type.lower()
        assert saved_format == format_type
    finally:
        # clean up the generated file regardless of the test outcome
        os.remove(audiofile_corrected)
def audiofile_length_match(filename1, filename2):
    '''Checks that two audiofiles have the same length.

    This may be useful if you have clean and noisy audiofiles that
    should be the same length. If the sample rates differ, the second
    file is resampled to the sample rate of the first before the lengths
    (in samples) are compared.

    Parameters
    ----------
    filename1 : str or pathlib.PosixPath
        The path to first audio file.
    filename2 : str or pathlib.PosixPath
        The path to second audio file.

    Returns
    -------
    bool : True if they match, False if not.

    Warning
    -------
    UserWarning
        If the sample rate of the audio files don't match.
    UserWarning
        If the length of the files don't match.
    '''
    # imported once up front so both warning branches can use it
    import warnings
    y1, sr1 = sp.loadsound(filename1)
    y2, sr2 = sp.loadsound(filename2)
    if sr1 != sr2:
        message = '\nWARNING: Sample rates do not match: '+\
            '\n{} has sr {}'.format(filename1, sr1)+\
            '\n{} has sr {}.'.format(filename2, sr2)
        warnings.warn(message)
        # bring the second signal to the first signal's sample rate so the
        # sample counts are comparable
        y2, sr2 = sp.dsp.resample_audio(y2, sr_original=sr2, sr_desired=sr1)
    assert sr1 == sr2
    if len(y1) != len(y2):
        message = '\nWARNING: audiofile length mismatch. '+\
            '\nLength {}: {}'.format(filename1, len(y1))+\
            '\nLength {}: {}'.format(filename2, len(y2))
        warnings.warn(message)
        return False
    return True
def test_loadsound_mono_uselibrosa_False():
    '''Load the stereo WAV fixture with `use_scipy=True` and default channel
    handling; check the first five samples, their 1-D shape and the
    sample rate.
    '''
    signal, rate = sp.loadsound(test_wav_stereo,
                                use_scipy=True,
                                remove_dc=False)
    head = signal[:5]
    reference = np.array([0.06140351] * 5)
    assert np.allclose(head, reference)
    # a mono result is one-dimensional
    assert (len(reference), ) == head.shape
    # sr of the audiofile (no default)
    assert 16000 == rate
def test_loadsound_mono_sr48000_uselibrosa_False():
    '''Load the stereo WAV fixture as mono at 48000 Hz with `use_scipy=True`
    and check the first five samples and the sample rate.
    '''
    target_sr = 48000
    signal, rate = sp.loadsound(test_wav_stereo,
                                mono=True,
                                sr=target_sr,
                                use_scipy=True,
                                remove_dc=False)
    reference = np.array(
        [0.07632732, 0.07633357, 0.07633357, 0.07632732, 0.07632107])
    assert np.allclose(signal[:5], reference)
    assert rate == target_sr
def test_loadsound_librosa_wav_dur1_sr22050():
    '''Load one second of the stereo WAV fixture at 22050 Hz with
    `use_scipy=False`; check the leading samples, sample rate and length.
    '''
    # use librosa to load file
    signal, rate = sp.loadsound(test_wav_stereo,
                                dur_sec=1,
                                sr=22050,
                                use_scipy=False,
                                remove_dc=False)
    leading_zeros = np.zeros(5)
    assert np.allclose(signal[:5], leading_zeros)
    assert rate == 22050
    # one second of audio holds exactly `rate` samples
    assert len(signal) == rate
def test_loadsound_librosa_wav_dur1_sr22050_stereo():
    '''Load one second of the stereo WAV fixture at 22050 Hz, keeping both
    channels, with `use_scipy=False`; check leading samples, sample rate
    and the (samples, channels) shape.
    '''
    # use librosa to load file
    signal, rate = sp.loadsound(test_wav_stereo,
                                mono=False,
                                dur_sec=1,
                                sr=22050,
                                use_scipy=False,
                                remove_dc=False)
    leading_zeros = np.zeros((3, 2))
    assert np.allclose(signal[:3], leading_zeros)
    assert rate == 22050
    assert signal.shape == (22050, 2)
def test_loadsound_stereo_sr48000_uselibrosa_False():
    '''Load the stereo WAV fixture at 48000 Hz, keeping both channels, with
    `use_scipy=True`; check the first three frames, their shape and the
    sample rate.
    '''
    target_sr = 48000
    signal, rate = sp.loadsound(test_wav_stereo,
                                sr=target_sr,
                                mono=False,
                                use_scipy=True,
                                remove_dc=False)
    reference = np.array([
        [0.07632732, 0.07632732],
        [0.07633357, 0.07628564],
        [0.07633357, 0.07628563],
    ])
    head = signal[:3]
    assert np.allclose(head, reference)
    assert reference.shape == head.shape
    assert target_sr == rate
def test_loadsound_stereo_dur1_uselibrosa_False():
    '''Load one second of the stereo WAV fixture, keeping both channels, with
    `use_scipy=True`; check the first three frames, their shape, the
    sample rate and the total number of frames.
    '''
    signal, rate = sp.loadsound(test_wav_stereo,
                                mono=False,
                                dur_sec=1,
                                use_scipy=True,
                                remove_dc=False)
    reference = np.array([[0.06140351, 0.06140351]] * 3)
    file_sr = 16000  # sr of the audiofile (no default)
    head = signal[:3]
    assert np.allclose(head, reference)
    assert reference.shape == head.shape
    assert file_sr == rate
    # one second of audio holds exactly `file_sr` frames
    assert len(signal) == file_sr
def get_samples(self, audiofile, dur_sec=None):
    """Load an audio file and hand its samples to `self.set_volume`.

    The file is loaded via `sp.loadsound` with this object's sample rate
    (`self.sr`).

    Parameters
    ----------
    audiofile : str
        Path and name of the audio file to load.
    dur_sec : int, float, optional
        Maximum length of time in seconds to load (default None).

    Returns
    -------
    signal : ndarray
        Signal amplitude values in the time domain.
    """
    signal, _ = sp.loadsound(audiofile, self.sr, dur_sec=dur_sec)
    # called for its effect on the instance; the return value is not used
    self.set_volume(signal, max_vol=self.max_vol)
    return signal
def test_get_feats_dur_sec_zeropad_False_mfcc():
    '''Extract MFCC features for half a second of audio without zeropadding
    and check that the number of frames equals the value computed by
    `sp.dsp.calc_num_subframes` for the same settings.
    '''
    settings = dict(dur_sec=0.5,
                    win_size_ms=20,
                    percent_overlap=0.5,
                    zeropad=False)
    signal, sr = sp.loadsound(test_audiofile, mono=True)
    mfcc = sp.feats.get_feats(signal,
                              sr=sr,
                              feature_type='mfcc',
                              **settings)
    # derive the expected frame count from the same settings
    total_samples = int(sr * settings['dur_sec'])
    frame_len = sp.dsp.calc_frame_length(settings['win_size_ms'], sr)
    overlap = int(frame_len * settings['percent_overlap'])
    expected_frames = sp.dsp.calc_num_subframes(
        total_samples,
        frame_length=frame_len,
        overlap_samples=overlap,
        zeropad=settings['zeropad'])
    assert len(mfcc) == expected_frames
def test_get_feats_signal_mono_default_2channels_no_change():
    '''With `feature_type='signal'` and no `mono` argument, a two-channel
    signal passed to `get_feats` keeps its shape.
    '''
    stereo, sr = sp.loadsound(test_audiofile, mono=False)
    feats = sp.feats.get_feats(stereo, sr=sr, feature_type='signal')
    assert stereo.shape == feats.shape
    # the input itself has two channels
    assert stereo.shape[1] == 2
def test_get_feats_signal_mono_True_2channels():
    '''With `feature_type='signal'` and `mono=True`, a two-channel signal
    passed to `get_feats` comes back one-dimensional.
    '''
    stereo, sr = sp.loadsound(test_audiofile, mono=False)
    feats = sp.feats.get_feats(stereo,
                               sr=sr,
                               feature_type='signal',
                               mono=True)
    assert len(feats.shape) == 1
    # the input itself has two channels
    assert stereo.shape[1] == 2
except SyntaxError: pass ######################################################### # For the purposes of plotting, let's use some of the settings defined: feature_type = feat_settings['feature_type'] sr = feat_settings['sr'] ###################################################### # Provide new audio for the denoiser to denoise! # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ######################################################### # We'll use sample speech from the soundpy repo: speech = sp.string2pathlib('{}audiodata/python.wav'.format(sp_dir)) s, sr = sp.loadsound(speech, sr=sr) ######################################################### # Let's add some white noise (10 SNR) s_n = sp.augment.add_white_noise(s, sr=sr, snr=10) ############################################################## # What does the noisy audio sound like? # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ipd.Audio(s_n, rate=sr) ############################################################## # What does the noisy audio look like? # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sp.plotsound(s_n, sr=sr, feature_type='signal', subprocess=True)
def test_loadsound_scipy_sr_None():
    '''With `sr=None` and `use_scipy=True`, the returned sample rate is the
    16000 Hz expected for the stereo WAV fixture.
    '''
    _, rate = sp.loadsound(test_wav_stereo, sr=None, use_scipy=True)
    assert rate == 16000
########################################################## # Noise sample: noise_sample = '{}audiodata/background_samples/cafe.wav'.format(sp_dir) noise_sample = sp.utils.string2pathlib(noise_sample) # as pathlib object, can do the following: noise = noise_sample.stem noise ########################################################## # Hear Clean Speech # ~~~~~~~~~~~~~~~~~ # I'm using a higher sample rate here as calculating SNR # performs best upwards of 44100 Hz. sr = 44100 s, sr = sp.loadsound(speech_sample, sr=sr) ipd.Audio(s, rate=sr) ########################################################## # Hear Noise # ~~~~~~~~~~ n, sr = sp.loadsound(noise_sample, sr=sr) ipd.Audio(n, rate=sr) ########################################################## # Hear Signal-to-Noise Ratio 20 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ noisyspeech_20snr, snr20 = sp.dsp.add_backgroundsound(speech_sample, noise_sample, sr=sr, snr=20)
########################################################## # Designate the path relevant for accessing audiodata # Note: the speech and sound come with the soundpy repo. sp_dir = '../../../' ########################################################## # Speech sample: speech = '{}audiodata/python.wav'.format(sp_dir) speech = sp.utils.string2pathlib(speech) ################################################################ # Hear and see speech # ~~~~~~~~~~~~~~~~~~~ sr = 44100 f, sr = sp.loadsound(speech, sr=sr) ipd.Audio(f,rate=sr) ########################################################## sp.plotsound(f, sr=sr, feature_type='stft', title='Female Speech "Python"') ########################################################## # Change Speed # ~~~~~~~~~~~~ ########################################################## # Let's increase the speed by 15%: fast = sp.augment.speed_increase(f, sr=sr, perc = 0.15) ##########################################################
def test_savesound_default_overwrite():
    '''Overwrite an existing WAV file with `overwrite=True` and check that
    the container format of the file is unchanged.

    Each `SoundFile` handle is closed as soon as its format has been
    read, so no handle to the file is open while it is overwritten.
    '''
    y, sr = sp.loadsound(test_wav_mono)
    with sf.SoundFile(test_wav_mono) as soundobject1:
        format_before = soundobject1.format
    filename = sp.savesound(test_wav_mono, y, sr, overwrite=True)
    with sf.SoundFile(filename) as soundobject2:
        format_after = soundobject2.format
    assert format_before == format_after
def augment_features(
        sound,
        sr,
        add_white_noise=False,
        snr=[5, 10, 20],
        speed_increase=False,
        speed_decrease=False,
        speed_perc=0.15,
        time_shift=False,
        shufflesound=False,
        num_subsections=3,
        harmonic_distortion=False,
        pitch_increase=False,
        pitch_decrease=False,
        num_semitones=2,
        vtlp=False,
        bilinear_warp=True,
        augment_settings_dict=None,
        random_seed=None,
):
    '''Applies the selected augmentations to audio, one after the other.

    If no `augment_settings_dict`, defaults applied.

    Parameters
    ----------
    sound : np.ndarray or path-like
        Sample data, or anything `sp.loadsound` accepts (loaded with `sr`).
    sr : int
        Sample rate of `sound`.
    add_white_noise : bool
        Add white noise at a signal-to-noise ratio taken from `snr` or
        from `augment_settings_dict['add_white_noise']['snr']`.
    snr : int, float or list
        Signal-to-noise ratio. If a list, tuple or array, one value is
        drawn with `np.random.choice`. The default list is never mutated.
    speed_increase, speed_decrease : bool
        Mutually exclusive. Use `speed_perc` unless settings are supplied.
    speed_perc : float
        Value passed as `perc` to the speed augmentations.
    time_shift : bool
    shufflesound : bool
        Uses `num_subsections` unless settings are supplied.
    num_subsections : int
    harmonic_distortion : bool
    pitch_increase, pitch_decrease : bool
        Mutually exclusive. Use `num_semitones` unless settings are supplied.
    num_semitones : int
    vtlp : bool
        Accepted but not applied here: vtlp does not return sample data
        and is handled by the caller.
    bilinear_warp : bool
        Currently not used by this function.
    augment_settings_dict : dict, optional
        Maps an augmentation name (e.g. 'speed_increase') to the keyword
        arguments for that augmentation. When given, it takes the place of
        the individual default arguments above.
    random_seed : int, optional
        Currently not used by this function.

    Returns
    -------
    samples_augmented : np.ndarray
        The augmented samples, set to the length of the input samples via
        `sp.dsp.set_signal_length`.
    augmentation : str
        Underscore-separated tags naming the augmentations applied.

    Raises
    ------
    ValueError
        If both `speed_increase` and `speed_decrease`, or both
        `pitch_increase` and `pitch_decrease`, are requested.
    '''
    if augment_settings_dict is not None:
        aug_settings = dict(augment_settings_dict)
    else:
        aug_settings = augment_settings_dict
    if speed_increase and speed_decrease:
        raise ValueError('Cannot have both speed_increase and speed_decrease'+\
            ' as augmentation options. Set just one to True.')
    if pitch_increase and pitch_decrease:
        raise ValueError('Cannot have both pitch_increase and pitch_decrease'+\
            ' as augmentation options. Set just one to True.')
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr)
        assert sr2 == sr
    samples = data.copy()
    samples_augmented = samples.copy()
    augmentation = ''

    if add_white_noise:
        # pick the snr source: supplied settings win over the `snr` argument
        if aug_settings is not None:
            snr_setting = aug_settings['add_white_noise']['snr']
            if isinstance(snr_setting, str):
                # restored into a local so the caller's dict is left untouched
                snr_setting = sp.utils.restore_dictvalue(snr_setting)
        else:
            snr_setting = snr
        # a collection of snr values: choose one randomly; a scalar is used
        # as is
        if isinstance(snr_setting, (list, tuple, np.ndarray)):
            snr = np.random.choice(snr_setting)
        else:
            snr = snr_setting
        samples_augmented = sp.augment.add_white_noise(samples_augmented,
                                                       sr=sr,
                                                       snr=snr)
        augmentation += '_whitenoise{}SNR'.format(snr)

    if speed_increase:
        if aug_settings is not None:
            kwargs_aug = aug_settings['speed_increase']
        else:
            kwargs_aug = {'perc': speed_perc}
        samples_augmented = sp.augment.speed_increase(samples_augmented,
                                                      sr=sr,
                                                      **kwargs_aug)
        augmentation += '_speedincrease{}'.format(kwargs_aug['perc'])
    elif speed_decrease:
        if aug_settings is not None:
            kwargs_aug = aug_settings['speed_decrease']
        else:
            kwargs_aug = {'perc': speed_perc}
        samples_augmented = sp.augment.speed_decrease(samples_augmented,
                                                      sr=sr,
                                                      **kwargs_aug)
        augmentation += '_speeddecrease{}'.format(kwargs_aug['perc'])

    if time_shift:
        samples_augmented = sp.augment.time_shift(samples_augmented, sr=sr)
        augmentation += '_randtimeshift'

    if shufflesound:
        if aug_settings is not None:
            kwargs_aug = aug_settings['shufflesound']
        else:
            kwargs_aug = {'num_subsections': num_subsections}
        samples_augmented = sp.augment.shufflesound(samples_augmented,
                                                    sr=sr,
                                                    **kwargs_aug)
        augmentation += '_randshuffle{}sections'.format(
            kwargs_aug['num_subsections'])

    if harmonic_distortion:
        samples_augmented = sp.augment.harmonic_distortion(samples_augmented,
                                                           sr=sr)
        augmentation += '_harmonicdistortion'

    if pitch_increase:
        if aug_settings is not None:
            kwargs_aug = aug_settings['pitch_increase']
        else:
            kwargs_aug = {'num_semitones': num_semitones}
        samples_augmented = sp.augment.pitch_increase(samples_augmented,
                                                      sr=sr,
                                                      **kwargs_aug)
        augmentation += '_pitchincrease{}semitones'.format(
            kwargs_aug['num_semitones'])
    elif pitch_decrease:
        if aug_settings is not None:
            kwargs_aug = aug_settings['pitch_decrease']
        else:
            kwargs_aug = {'num_semitones': num_semitones}
        samples_augmented = sp.augment.pitch_decrease(samples_augmented,
                                                      sr=sr,
                                                      **kwargs_aug)
        augmentation += '_pitchdecrease{}semitones'.format(
            kwargs_aug['num_semitones'])

    # all augmentation techniques return sample data except for vtlp
    # therefore vtlp will be handled outside of this function (returns stft or powspec)
    if vtlp:
        pass

    samples_augmented = sp.dsp.set_signal_length(samples_augmented,
                                                 len(samples))
    return samples_augmented, augmentation
def generator(self):
    '''Extracts features and feeds them to model according to `desired_input_shape`.

    Loops endlessly over `self.audiolist` (and `self.audiolist2`, if set).
    For the item at `self.counter` it loads the audio, optionally relabels
    it as silence, optionally augments it, extracts the features named in
    `self.kwargs['feature_type']`, optionally applies log and
    normalization, optionally saves plots, reshapes the features and
    yields them. `self.counter` is advanced on every yield and reset to 0
    once it reaches `self.number_of_batches`.

    Items of `self.audiolist` are either audio paths or 2-tuples holding
    an integer-like label and an audio path, in either order. Items of
    `self.audiolist2` are either integer-like labels or paths to a second
    (target) audio file.

    Yields
    ------
    batch_x : np.ndarray
        Features of the current audio file.
    batch_y : np.ndarray
        The label with shape (1,) if the data are labeled, otherwise the
        features of the second audio file.

    Raises
    ------
    ValueError
        If a tuple item does not have length 2 or holds no integer-like
        label; if two conflicting labels are supplied for one file; if the
        VTLP alpha values of input and target audio differ; or if neither
        a label nor a second audio file is available.
    '''
    while 1:
        augmentation = ''
        audioinfo = self.audiolist[self.counter]
        # does the list contain label audiofile pairs?
        if isinstance(audioinfo, tuple):
            if len(audioinfo) != 2:
                raise ValueError('Expected tuple containing audio file path and label. '+\
                    'Instead received tuple of length: \n{}'.format(len(audioinfo)))
            # if label is a string digit, int, or float - turn to int
            if isinstance(audioinfo[0], int) or isinstance(audioinfo[0], float) or \
                    isinstance(audioinfo[0], str) and audioinfo[0].isdigit():
                label = int(audioinfo[0])
                audiopath = audioinfo[1]
            elif isinstance(audioinfo[1], int) or isinstance(audioinfo[1], float) or \
                    isinstance(audioinfo[1], str) and audioinfo[1].isdigit():
                label = int(audioinfo[1])
                # label is the second element, so the path is the first
                audiopath = audioinfo[0]
            else:
                raise ValueError('Expected tuple to contain an integer label '+\
                    'and audio pathway. Received instead tuple with types '+\
                    '{} and {}.'.format(type(audioinfo[0]), type(audioinfo[1])))
        # otherwise list of audiofiles
        else:
            audiopath = audioinfo
            label = None

        if self.audiolist2 is not None:
            # expects audiolist2 to be either integer labels or audiofile pathways
            audioinfo2 = self.audiolist2[self.counter]
            if isinstance(audioinfo2, int) or isinstance(audioinfo2, str) and \
                    audioinfo2.isdigit():
                if label is None:
                    label = audioinfo2
                else:
                    if label == int(audioinfo2):
                        pass
                    else:
                        raise ValueError('Provided conflicting labels for '+\
                            'current audiofile: {}.'.format(audiopath) +\
                            '\nReceived both label {} and {} .'.format(
                                label, int(audioinfo2)))
                audiopath2 = None
            else:
                audiopath2 = audioinfo2
        else:
            audiopath2 = None

        if label is not None:
            labeled_data = True
            if self.decode_dict is not None:
                try:
                    label_pic = self.decode_dict[label].upper()
                except KeyError:
                    # dictionary keys might be string type, not int type
                    label_pic = self.decode_dict[str(int(label))].upper()
            else:
                label_pic = label
        else:
            labeled_data = False
            label_pic = None

        # ensure audio is valid:
        y, sr = sp.loadsound(audiopath, self.kwargs['sr'])
        if audiopath2:
            y2, sr2 = sp.loadsound(audiopath2, self.kwargs['sr'])
        else:
            y2, sr2 = None, None

        if self.label_silence:
            if self.vad_start_end:
                y_stft, vad = sp.dsp.get_stft_clipped(y,
                                                      sr=sr,
                                                      win_size_ms=50,
                                                      percent_overlap=0.5)
            else:
                y_stft, __ = sp.feats.get_vad_stft(y,
                                                   sr=sr,
                                                   win_size_ms=50,
                                                   percent_overlap=0.5,
                                                   use_beg_ms=120,
                                                   energy_thresh=40,
                                                   freq_thresh=185,
                                                   sfm_thresh=5)
            if not y_stft.any():
                # the last entry of `decode_dict` is used as the label
                label = len(self.decode_dict) - 1
                print('\nNo voice activity detected in {}'.format(audiopath))
                print('Label {} adjusted to {}.'.format(
                    label_pic, self.decode_dict[label]))
                label_pic = self.decode_dict[label]

        # augment_data
        if self.augment_dict is not None:
            aug_dict = randomize_augs(self.augment_dict)
            augmented_data, augmentation = augment_features(
                y, self.kwargs['sr'], **aug_dict)
            if audiopath2:
                # remove 'add_white_noise' if in aug_dict
                aug_dict2 = {}
                for key, value in aug_dict.items():
                    if key != 'add_white_noise':
                        aug_dict2[key] = value
                augmented_data2, augmentation2 = augment_features(
                    y2, self.kwargs['sr'], **aug_dict2)
        else:
            augmented_data, augmentation = y, ''
            aug_dict = dict()
            augmented_data2, augmentation2 = y2, ''
            aug_dict2 = dict()

        # extract features
        # will be shape (num_frames, num_features)
        if 'vtlp' in aug_dict and aug_dict['vtlp']:
            sr = self.kwargs['sr']
            win_size_ms = sp.utils.restore_dictvalue(
                self.kwargs['win_size_ms'])
            percent_overlap = sp.utils.restore_dictvalue(
                self.kwargs['percent_overlap'])
            fft_bins = sp.utils.restore_dictvalue(self.kwargs['fft_bins'])
            window = sp.utils.restore_dictvalue(self.kwargs['window'])
            real_signal = sp.utils.restore_dictvalue(
                self.kwargs['real_signal'])
            feature_type_vtlp = 'stft'
            dur_sec = sp.utils.restore_dictvalue(self.kwargs['dur_sec'])
            zeropad = sp.utils.restore_dictvalue(self.kwargs['zeropad'])

            # need to tell vtlp the size of fft we need, in order to
            # be able to extract fbank and mfcc features as well
            expected_stft_shape, __ = sp.feats.get_feature_matrix_shape(
                sr=sr,
                dur_sec=dur_sec,
                feature_type=feature_type_vtlp,
                win_size_ms=win_size_ms,
                percent_overlap=percent_overlap,
                fft_bins=fft_bins,
                zeropad=zeropad,
                real_signal=real_signal)

            # TODO bug fix: oversize_factor higher than 1:
            # how to reduce dimension back to `expected_stft_shape` without
            # shaving off data?
            oversize_factor = 16
            augmented_data, alpha = sp.augment.vtlp(
                augmented_data,
                sr,
                win_size_ms=win_size_ms,
                percent_overlap=percent_overlap,
                fft_bins=fft_bins,
                window=window,
                real_signal=real_signal,
                expected_shape=expected_stft_shape,
                oversize_factor=oversize_factor,
                visualize=False)
            # vtlp was last augmentation to be added to `augmentation` string
            # add the value that was applied
            augmentation += '_vtlp' + str(alpha)

            # only warp the target audio if there is any: without a second
            # audio file `augmented_data2` is not defined in this branch
            if audiopath2:
                # need to be able to set alpha
                augmented_data2, alpha2 = sp.augment.vtlp(
                    augmented_data2,
                    sr,
                    a=alpha,
                    win_size_ms=win_size_ms,
                    percent_overlap=percent_overlap,
                    fft_bins=fft_bins,
                    window=window,
                    real_signal=real_signal,
                    expected_shape=expected_stft_shape,
                    oversize_factor=oversize_factor,
                    visualize=False)
                try:
                    assert alpha == alpha2
                except AssertionError:
                    raise ValueError('The alpha value for vtlp application '+\
                        'does not match for the X and y audio: '+\
                        'X alpha is {} and y alpha is {}'.format(alpha, alpha2))
                # vtlp was last augmentation to be added to `augmentation` string
                # add the value that was applied
                augmentation2 += '_vtlp' + str(alpha)

        if 'vtlp' in aug_dict and aug_dict['vtlp']:
            if 'stft' in self.kwargs['feature_type'] or \
                    'powspec' in self.kwargs['feature_type']:
                if 'stft' in self.kwargs[
                        'feature_type'] and oversize_factor > 1:
                    import warnings
                    msg = '\nWARNING: due to resizing of STFT matrix due to '+\
                        ' `oversize_factor` {}, converted to '.format(oversize_factor)+\
                        'power spectrum. Phase information has been removed.'
                    warnings.warn(msg)
                feats = augmented_data
                if audiopath2:
                    feats2 = augmented_data2
                if 'powspec' in self.kwargs[
                        'feature_type'] and oversize_factor == 1:
                    # otherwise already a power spectrum
                    feats = sp.dsp.calc_power(feats)
                    if audiopath2:
                        feats2 = sp.dsp.calc_power(feats2)

        elif 'stft'in self.kwargs['feature_type'] or \
                'powspec' in self.kwargs['feature_type']:
            feats = sp.feats.get_stft(
                augmented_data,
                sr=self.kwargs['sr'],
                win_size_ms=self.kwargs['win_size_ms'],
                percent_overlap=self.kwargs['percent_overlap'],
                real_signal=self.kwargs['real_signal'],
                fft_bins=self.kwargs['fft_bins'],
                rate_of_change=self.kwargs['rate_of_change'],
                rate_of_acceleration=self.kwargs['rate_of_acceleration'],
                window=self.kwargs['window'],
                zeropad=self.kwargs['zeropad'])
            if audiopath2:
                feats2 = sp.feats.get_stft(
                    augmented_data2,
                    sr=self.kwargs['sr'],
                    win_size_ms=self.kwargs['win_size_ms'],
                    percent_overlap=self.kwargs['percent_overlap'],
                    real_signal=self.kwargs['real_signal'],
                    fft_bins=self.kwargs['fft_bins'],
                    rate_of_change=self.kwargs['rate_of_change'],
                    rate_of_acceleration=self.
                    kwargs['rate_of_acceleration'],
                    window=self.kwargs['window'],
                    zeropad=self.kwargs['zeropad'])

            if 'powspec' in self.kwargs['feature_type']:
                feats = sp.dsp.calc_power(feats)
                if audiopath2:
                    feats2 = sp.dsp.calc_power(feats2)

        if 'fbank' in self.kwargs['feature_type']:
            feats = sp.feats.get_fbank(
                augmented_data,
                sr=self.kwargs['sr'],
                num_filters=self.kwargs['num_filters'],
                win_size_ms=self.kwargs['win_size_ms'],
                percent_overlap=self.kwargs['percent_overlap'],
                real_signal=self.kwargs['real_signal'],
                fft_bins=self.kwargs['fft_bins'],
                rate_of_change=self.kwargs['rate_of_change'],
                rate_of_acceleration=self.kwargs['rate_of_acceleration'],
                window=self.kwargs['window'],
                zeropad=self.kwargs['zeropad'])
            if audiopath2:
                feats2 = sp.feats.get_fbank(
                    augmented_data2,
                    sr=self.kwargs['sr'],
                    num_filters=self.kwargs['num_filters'],
                    win_size_ms=self.kwargs['win_size_ms'],
                    percent_overlap=self.kwargs['percent_overlap'],
                    real_signal=self.kwargs['real_signal'],
                    fft_bins=self.kwargs['fft_bins'],
                    rate_of_change=self.kwargs['rate_of_change'],
                    rate_of_acceleration=self.
                    kwargs['rate_of_acceleration'],
                    window=self.kwargs['window'],
                    zeropad=self.kwargs['zeropad'])

        elif 'mfcc' in self.kwargs['feature_type']:
            feats = sp.feats.get_mfcc(
                augmented_data,
                sr=self.kwargs['sr'],
                num_mfcc=self.kwargs['num_mfcc'],
                num_filters=self.kwargs['num_filters'],
                win_size_ms=self.kwargs['win_size_ms'],
                percent_overlap=self.kwargs['percent_overlap'],
                real_signal=self.kwargs['real_signal'],
                fft_bins=self.kwargs['fft_bins'],
                rate_of_change=self.kwargs['rate_of_change'],
                rate_of_acceleration=self.kwargs['rate_of_acceleration'],
                window=self.kwargs['window'],
                zeropad=self.kwargs['zeropad'])
            if audiopath2:
                feats2 = sp.feats.get_mfcc(
                    augmented_data2,
                    sr=self.kwargs['sr'],
                    num_mfcc=self.kwargs['num_mfcc'],
                    num_filters=self.kwargs['num_filters'],
                    win_size_ms=self.kwargs['win_size_ms'],
                    percent_overlap=self.kwargs['percent_overlap'],
                    real_signal=self.kwargs['real_signal'],
                    fft_bins=self.kwargs['fft_bins'],
                    rate_of_change=self.kwargs['rate_of_change'],
                    rate_of_acceleration=self.
                    kwargs['rate_of_acceleration'],
                    window=self.kwargs['window'],
                    zeropad=self.kwargs['zeropad'])

        if self.apply_log:
            # TODO test
            # NOTE(review): `.any()` returns a boolean, so `... < 0` is never
            # true and `np.abs` is never applied. Left unchanged because the
            # intended check (any negative value?) would alter the features.
            if feats[0].any() < 0:
                feats = np.abs(feats)
            feats = np.log(feats)
        if self.normalize:
            feats = sp.feats.normalize(feats)
        if audiopath2:
            if self.apply_log:
                # TODO test
                # NOTE(review): same never-true condition as for `feats`.
                if feats2[0].any() < 0:
                    feats2 = np.abs(feats2)
                feats2 = np.log(feats2)
            if self.normalize:
                feats2 = sp.feats.normalize(feats2)
        else:
            feats2 = None

        # Save visuals if desired
        if self.visualize:
            if self.counter % self.vis_every_n_items == 0:
                # make augmentation string more legible.
                augments_vis = augmentation[1:].split('_')
                if len(augments_vis) > 1:
                    augs1 = augments_vis[:len(augments_vis) // 2]
                    augs2 = augments_vis[len(augments_vis) // 2:]
                    augs1 = ', '.join(augs1)
                    augs2 = ', '.join(augs2)
                else:
                    augs1 = augments_vis[0]
                    augs2 = ''
                if self.visuals_dir is not None:
                    save_visuals_path = sp.check_dir(self.visuals_dir,
                                                     make=True)
                else:
                    save_visuals_path = sp.check_dir('./training_images/',
                                                     make=True)
                save_visuals_path = save_visuals_path.joinpath(
                    '{}_label{}_training_{}_{}_{}.png'.format(
                        self.dataset, label_pic, self.model_name,
                        augmentation, sp.utils.get_date()))
                feature_type = self.kwargs['feature_type']
                sr = self.kwargs['sr']
                win_size_ms = self.kwargs['win_size_ms']
                percent_overlap = self.kwargs['percent_overlap']
                if 'stft' in feature_type or 'powspec' in feature_type or 'fbank' \
                        in feature_type:
                    energy_scale = 'power_to_db'
                else:
                    energy_scale = None
                sp.feats.plot(
                    feature_matrix=feats,
                    feature_type=feature_type,
                    sr=sr,
                    win_size_ms=win_size_ms,
                    percent_overlap=percent_overlap,
                    energy_scale=energy_scale,
                    save_pic=True,
                    name4pic=save_visuals_path,
                    title='"{}" {} Aug: {}-\n{}'.format(
                        label_pic, feature_type.upper(), augs1, augs2),
                    subprocess=True)  #use Agg backend for plotting
                if feats2 is not None:
                    # add '_2' to pathway
                    p = sp.utils.string2pathlib(save_visuals_path)
                    # `stem` and `suffix` live on the path object itself;
                    # `p.name` is a plain string
                    save_visuals_path2 = p.parent.joinpath(p.stem + '_2' +
                                                           p.suffix)
                    sp.feats.plot(feature_matrix=feats2,
                                  feature_type=feature_type,
                                  sr=sr,
                                  win_size_ms=win_size_ms,
                                  percent_overlap=percent_overlap,
                                  energy_scale=energy_scale,
                                  save_pic=True,
                                  name4pic=save_visuals_path2,
                                  title='Output {} features {}'.format(
                                      label_pic, feature_type),
                                  subprocess=True)

        batch_x = feats
        batch_y = feats2

        # reshape features to allow for timestep / subsection features
        if self.timestep is not None:
            batch_x = sp.feats.apply_new_subframe(
                batch_x,
                new_frame_size=self.timestep,
                zeropad=self.kwargs['zeropad'],
                axis=self.axis_timestep)
            if batch_y is not None:
                batch_y = sp.feats.apply_new_subframe(
                    batch_y,
                    new_frame_size=self.timestep,
                    zeropad=self.kwargs['zeropad'],
                    axis=self.axis_timestep)

        # reshape features to allow for context window / subsection features
        if self.context_window is not None:
            batch_x = sp.feats.apply_new_subframe(
                batch_x,
                new_frame_size=self.context_window * 2 + 1,
                zeropad=self.kwargs['zeropad'],
                axis=self.axis_context)
            if batch_y is not None:
                # same helper as for `batch_x`, addressed through `sp.feats`
                batch_y = sp.feats.apply_new_subframe(
                    batch_y,
                    new_frame_size=self.context_window * 2 + 1,
                    zeropad=self.kwargs['zeropad'],
                    axis=self.axis_context)

        # grayscale 2 color
        if self.gray2color:
            batch_x = sp.feats.grayscale2color(
                batch_x, colorscale=3)  # default colorscale is 3
            if batch_y is not None:
                batch_y = sp.feats.grayscale2color(batch_y, colorscale=3)

        # reshape to input shape. Will be zeropadded or limited to this shape.
        # tensor dimensions on either side can be added here as well.
        if self.desired_input_shape is not None:
            batch_x = sp.feats.adjust_shape(batch_x,
                                            self.desired_input_shape)
            if batch_y is not None:
                batch_y = sp.feats.adjust_shape(batch_y,
                                                self.desired_input_shape)

        # prepare data to be fed to network:
        if labeled_data:
            # has to be at least (1,)
            batch_y = np.expand_dims(np.array(label), axis=0)
        elif batch_y is not None:
            pass
        else:
            raise ValueError('No independent variable provided.')

        self.counter += 1
        yield batch_x, batch_y

        #restart counter to yield data in the next epoch as well
        if self.counter >= self.number_of_batches:
            self.counter = 0
def test_loadsound_librosa_aiff():
    '''Load the AIFF fixture with `use_scipy=False` and check its first five
    samples and its sample rate.
    '''
    signal, rate = sp.loadsound(test_aiff, use_scipy=False, remove_dc=False)
    first_five = np.array(
        [0.09291077, 0.06417847, 0.04179382, 0.02642822, 0.01808167])
    assert np.allclose(signal[:5], first_five)
    assert rate == 48000
def test_loadsound_librosa_ogg():
    '''Load the OGG fixture with `use_scipy=False` and check its first five
    samples and its sample rate.
    '''
    signal, rate = sp.loadsound(test_ogg, use_scipy=False, remove_dc=False)
    first_five = np.array(
        [-0.00639889, -0.00722905, -0.00864992, -0.00878596, -0.00894831])
    assert np.allclose(signal[:5], first_five)
    assert rate == 44100
def test_loadsound_librosa_m4a():
    '''Load the M4A fixture with `use_scipy=False` and check its first five
    samples (all zero) and its sample rate.
    '''
    signal, rate = sp.loadsound(test_m4a, use_scipy=False, remove_dc=False)
    leading_zeros = np.zeros(5)
    assert np.allclose(signal[:5], leading_zeros)
    assert rate == 48000
def test_loadsound_librosa_sr_None():
    '''With `sr=None` and the default loader, the returned sample rate is
    the 16000 Hz expected for the stereo WAV fixture.
    '''
    _, rate = sp.loadsound(test_wav_stereo, sr=None)
    assert rate == 16000
def test_get_feats_dur_sec_signal():
    '''With `feature_type='signal'` and `dur_sec=0.5`, `get_feats` returns
    `int(sr * dur_sec)` samples.
    '''
    half_second = 0.5
    signal, sr = sp.loadsound(test_audiofile, mono=True)
    clipped = sp.feats.get_feats(signal,
                                 sr=sr,
                                 dur_sec=half_second,
                                 feature_type='signal')
    expected_len = int(sr * half_second)
    assert len(clipped) == expected_len
######################################################
# Load sample speech audio
# ------------------------
# We will look at how these two options handle two different speech samples.
# The speech samples will be combined but separated by a silence.
# They will also be altered with white noise.

######################################################
# "Python"
# ~~~~~~~~
# Note: this file is available in the soundpy repo.

# VAD and filtering work best with high sample rates
sr = 48000
# the path depends only on `sp_dir`; the sample rate is applied on loading
python = '{}audiodata/python.wav'.format(sp_dir)
y_p, sr = sp.loadsound(python, sr=sr)
ipd.Audio(y_p, rate=sr)

######################################################
# "six"
# ~~~~~
# This is a sample file from the speech commands dataset
# (Attribution 4.0 International (CC BY 4.0))
# dataset: https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html
# license: https://creativecommons.org/licenses/by/4.0/

######################################################
# This audio has two fricatives in it: 's' and 'x',
# which will be shown to cause issues as noise increases.
six = '{}audiodata/six.wav'.format(sp_dir)
y_six, sr = sp.loadsound(six, sr=sr)
def test_savesound_default_FileExistsError():
    '''Saving onto an existing file with the default arguments of
    `sp.savesound` raises `FileExistsError`.
    '''
    signal, rate = sp.loadsound(test_wav_mono)
    expect_refusal = pytest.raises(FileExistsError)
    with expect_refusal:
        sp.savesound(test_wav_mono, signal, rate)