def collect_wavs(filenames, dest_fs=None):
    """
    Collects and packages a set of wav files into a single array of samples.

    Args:
        filenames: File locations as a list
        dest_fs: Sampling frequency to resample the files to

    Returns:
        An array of the samples of the files as [N_files x N_samples],
        zero-padded to the length of the longest file
    """
    import numpy as np
    from scipy.io.wavfile import read
    from utils_spaudio import my_resample

    if not isinstance(filenames, (list, tuple)):
        filenames = [filenames]
    samples = []
    max_len = 0
    for the_filename in filenames:
        fs, new_samples = read(the_filename)
        if dest_fs:
            new_samples = my_resample(new_samples, fs, dest_fs)
        if new_samples.ndim == 1:
            new_samples = np.atleast_2d(new_samples).T
        max_len = max(max_len, new_samples.shape[0])
        samples.append(new_samples)
    # Zero-pad all signals to the length of the longest one
    for i in range(len(samples)):
        this_len = samples[i].shape[0]
        missing = max_len - this_len
        if missing > 0:
            samples[i] = np.concatenate(
                (samples[i],
                 np.zeros((missing, samples[i].shape[1]),
                          dtype=samples[i].dtype)))
    out = np.concatenate([samples[i].T for i in range(len(samples))], axis=0)
    return out
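

# Illustrative usage sketch (not part of the original module). The wav paths
# below are placeholders; the call packs two files, resampled to 16 kHz, into
# one zero-padded array.
def _example_collect_wavs():
    wavs = collect_wavs(['/tmp/file_a.wav', '/tmp/file_b.wav'], dest_fs=16000)
    # Rows are files (or channels of files), columns are samples; shorter
    # signals are zero-padded to the length of the longest one.
    print(wavs.shape)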


def viz_net(model_loc, air_loc=None, nrows=4, interactive=False, channel=0,
            layer_idx=0, speechfile=None):
    """
    Visualization worker. Accepts the model and a set of audio data which can
    be filtered by the network to provide the visualizations. If no input
    audio data are given, then the filter kernels are visualised.

    Args:
        model_loc: The location of the model, saved by Keras as an HDF5 file
        air_loc: Location of the AIR .wav file(s)
        nrows: Number of rows used in plotting
        interactive: Interactive plotting (waits for you to close the plots)
        channel: Channel of the AIR to use
        layer_idx: The output of the layer_idx-th convolutional layer will be
            used to collect the feature maps
        speechfile: A speech file which will be convolved with the AIR before
            filtering

    Returns:
        Nothing
    """
    from os.path import basename
    from scipy.signal import fftconvolve

    outdir = '/tmp/training_surface_models/' + basename(model_loc).replace(
        '.h5', '') + '/'
    i_made_the_model = False
    try:
        model = load_model(model_loc)
    except ValueError as ME1:
        i_made_the_model = True
        try:
            from ace_discriminative_nets import get_model_speech
            print('Failed to use default load for model ' + model_loc +
                  ', will try the speech CNN because ' + str(ME1))
            model = get_model_speech((500, 161), 7, use_cnn=True)
            model.load_weights(model_loc, by_name=True)
            print('CNN model constructed OK')
        except ValueError as ME2:
            try:
                print('Failed to use the CNN model ' + model_loc +
                      ', will try the speech CNN-RNN because ' + str(ME2))
                from ace_discriminative_nets import get_model_speech
                model = get_model_speech((500, 161), 7, use_cnn=True,
                                         use_rnn=True)
                model.load_weights(model_loc, by_name=True)
                print('CNN-RNN model constructed OK')
            except ValueError as ME3:
                print('Failed to use the CNN-RNN model ' + model_loc +
                      ' because ' + str(ME3))
                raise ME1
    if air_loc is None:
        # No audio given, so visualize the filter kernels themselves
        viz_net_individual(model, outdir, nrows=nrows,
                           interactive=interactive)
        return

    conv_layers = []
    conv_layers_idxs = []
    for idx, i in enumerate(model.layers):
        if isinstance(i, Conv2D):
            conv_layers.append(i)
            conv_layers_idxs.append(idx)
    from keras.models import Model
    if layer_idx == -1:
        effective_idx = 1
    else:
        if layer_idx >= len(conv_layers_idxs):
            effective_idx = conv_layers_idxs[-1] + 2
            print('I will assume that you want the next layer after the last '
                  'conv layer, which I will assume is a max-pooling layer')
        else:
            effective_idx = conv_layers_idxs[layer_idx]
    print('Picking Layer ' + model.layers[effective_idx].name)
    model = Model(inputs=[model.input],
                  outputs=[model.layers[effective_idx].output])

    if speechfile is not None:
        fs_speech, x_speech = wavfile.read(speechfile)
        if x_speech.ndim > 1:
            x_speech = x_speech[:, 0]
    else:
        x_speech = None
        fs_speech = None

    for this_air in air_loc:
        if this_air == 'white':
            suffix = '_white'
            in_shape = model.input_shape[1:]
            x = np.atleast_2d(
                np.random.normal(0, 1., size=np.prod(in_shape) * 4))
        else:
            suffix = '_' + run_command('basename ' + this_air)[0]
            print('Loading: ' + this_air)
            fs, x = wavfile.read(this_air)
            x = x[:, channel]
            if speechfile is not None:
                print('Will convolve with ' + speechfile)
                if not fs_speech == fs:
                    x_speech_effective = my_resample(
                        x_speech[0:int(max_speech_len * fs_speech)],
                        fs_speech, fs)
                else:
                    x_speech_effective = x_speech
                x_speech_effective.setflags(write=1)
                if trim_speech_to is not None:
                    x_speech_effective[int(trim_speech_to * fs_speech):] = 0
                x = fftconvolve(x_speech_effective, x, mode='same')
            else:
                if i_made_the_model:
                    raise AssertionError(
                        'Because the default model-load failed, I was going '
                        'to try to construct speech models but you did not '
                        'provide any speech data')
        if x.ndim > 1:
            x = x[:, 0]
        x = np.atleast_2d(x)
        suffix += '_l' + str(layer_idx)
        viz_net_other(model, x, suffix, outdir, nrows=nrows,
                      interactive=interactive,
                      doing_speech=speechfile is not None)
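

# Illustrative usage sketch (not part of the original module). The paths are
# placeholders; passing 'white' as an AIR location feeds white noise through
# the network instead of a measured response.
def _example_viz_net():
    model_file = '/tmp/trained_model.h5'  # placeholder model location
    # Visualize the feature maps of the first conv layer for white noise
    viz_net(model_file, air_loc=['white'], nrows=4, layer_idx=0)
    # Visualize the response to reverberant speech made from a real AIR
    viz_net(model_file, air_loc=['/tmp/air_example.wav'],
            speechfile='/tmp/clean_speech.wav', channel=0, layer_idx=1)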


def read_airs_from_wavs(wav_files, framesize=None, get_pow_spec=True,
                        max_air_len=None, fs=None, forced_fs=None,
                        keep_ids=None, cacheloc='/tmp/', start_at_max=True,
                        read_cached_latest=False, wavform_logpow=False,
                        write_cached_latest=True, max_speech_read=None,
                        max_air_read=None, utt_per_env=1,
                        parse_as_dirctories=True, speech_files=None,
                        save_speech_associations=True, save_speech_examples=10,
                        drop_speech=False, as_hdf5_ds=True,
                        choose_channel=None, no_fex=False, scratchpad='/tmp/',
                        copy_associations_to=None, given_associations=None):
    """
    Given a set of AIR files and additional information, prepares data for
    the training of DNNs for environment classification.

    Args:
        wav_files: Location of AIR wav files
        framesize: The framesize to use
        get_pow_spec: Convert audio to log-power spectrum domain
        max_air_len: The maximum length of the signals (truncate or pad to it)
        fs: The sampling frequency of the wav files to expect
        forced_fs: The sampling frequency to convert the data to
        keep_ids: None (not used)
        cacheloc: Location to use for cache reading and saving
        start_at_max: Modify the signals so that the maximum energy sample is
            at the beginning (can be used to align AIRs)
        read_cached_latest: Read the data from the last saved cache (if any)
        wavform_logpow: Get the signals in the log-power time domain
        write_cached_latest: Write the collected data in a cache for fast reuse
        max_speech_read: Maximum length of speech signal to read
        max_air_read: Maximum AIR length to read up to
        utt_per_env: Number of utterances to convolve with each AIR
        parse_as_dirctories: Parse the inputs as directories and not as
            individual files
        speech_files: Speech file locations
        save_speech_associations: Save the speech associations with the
            corresponding AIRs
        save_speech_examples: Enable the saving of examples of the reverberant
            speech created
        drop_speech: Do not include the speech samples in the saving of the
            cache or in the RAM. Keep only the training data arrays
        as_hdf5_ds: Keep the data as HDF5 datasets on disk (reduces RAM usage
            a lot)
        choose_channel: Channels to use for each AIR
        no_fex: Skip the data processing phase and just return the raw signals
        scratchpad: Location to use for temporary saving
        copy_associations_to: Save a copy of the speech-AIR associations here
        given_associations: Provided associations between speech files and
            AIRs. This can be used in the case where you want to use specific
            speech samples for specific AIRs

    Returns:
        (X, None), Sample_names, None, (AIRs, Speech, Reverberant_speech),
        Number_of_utterances_convolved_with_each_AIR
    """
    from os.path import isfile, basename
    from scipy.signal import fftconvolve
    import numpy as np
    from h5py import File
    from scipy.io import wavfile
    from utils_spaudio import my_resample, write_wav
    from utils_base import find_all_ft, run_command
    from random import sample
    import pandas as pd
    from random import randint
    from time import time

    run_command('mkdir -p ' + cacheloc)
    latest_file = cacheloc + '/training_test_data_wav.h5'
    timestamp = str(time())
    filename_associations = (scratchpad + '/air_speech_associations_' +
                             timestamp + '.csv')
    base_examples_dir = scratchpad + '/feature_extraction_examples/'

    if keep_ids is not None:
        raise AssertionError('No ids exist in this context')
    if speech_files is None:
        utt_per_env = 1
        if save_speech_associations:
            print('There is no speech to save in associations, '
                  'setting to false')
            save_speech_associations = False
        if save_speech_examples:
            print('There is no speech to save audio for, '
                  'setting to 0 examples')
            save_speech_examples = 0

    # Try to serve the request from the latest cache
    try:
        hf = None
        if isfile(latest_file) and read_cached_latest:
            print('Reading: ' + latest_file)
            hf = File(latest_file, 'r')
            if as_hdf5_ds:
                x = hf['x']
                ids = hf['ids']
                airs = hf['airs']
                utt_per_env = np.array(hf['utts'])
                rev_speech = hf['rev_names']
                clean_speech = hf['clean_speech']
                print('Done creating handles to : ' + latest_file)
            else:
                utt_per_env = np.array(hf['utts'])
                x = np.array(hf.get('x'))
                ids = np.array(hf.get('ids'))
                airs = np.array(hf.get('airs'))
                rev_speech = np.array(hf.get('rev_names'))
                clean_speech = np.array(hf.get('clean_speech'))
                print('Done reading : ' + latest_file)
            if given_associations is not None:
                print('! I read the cache so the given associations were '
                      'not used')
            if copy_associations_to is not None:
                print('! I read the cache so the associations could not be '
                      'saved at ' + copy_associations_to)
            return ((x, None), ids, None,
                    (airs, clean_speech, rev_speech), utt_per_env)
    except (ValueError, KeyError) as ME:
        print('Tried to read ' + latest_file + ' but failed with ' + str(ME))
        if hf is not None:
            hf.close()

    if given_associations is not None:
        print('You gave me speech associations, Speech: ' +
              str(len(given_associations['speech'])) +
              ' entries and Offsets: ' +
              str(len(given_associations['offsets'])) + ' entries')

    ids = None
    x = None
    x_speech = None
    x_rev_speech = None

    if forced_fs is None:
        forced_fs = fs
    resample_op = lambda x: x
    if not forced_fs == fs:
        resample_op = lambda x: np.array(
            my_resample(np.array(x.T, dtype=float), fs, forced_fs)).T

    if max_air_read is not None:
        if fs is None:
            raise AssertionError('Cannot work with max_air_read without fs')
        max_air_read_samples = int(np.ceil(fs * max_air_read))
    if max_speech_read is not None:
        if fs is None:
            raise AssertionError('Cannot work with max_speech_read without fs')
        max_speech_read_samples = int(np.ceil(fs * max_speech_read))
    else:
        max_speech_read_samples = None

    if parse_as_dirctories:
        if not type(wav_files) is list:
            wav_files = [wav_files]
        wav_files = find_all_ft(wav_files, ft='.wav', find_iname=True)
        if speech_files is not None:
            if not type(speech_files) is list:
                speech_files = [speech_files]
            speech_files = find_all_ft(speech_files, ft='.wav',
                                       find_iname=True)

    if save_speech_examples:
        run_command('rm -r ' + base_examples_dir)
        run_command('mkdir -p ' + base_examples_dir)

    associations = []
    save_counter = 0
    all_names = [basename(i).replace('.wav', '') + '_' + str(j)
                 for i in wav_files for j in range(utt_per_env)]
    if type(choose_channel) is list:
        choose_channel = [i for i in choose_channel
                          for _ in range(utt_per_env)]
    wav_files = [i for i in wav_files for _ in range(utt_per_env)]
    offsets = []
    for i, this_wav_file in enumerate(wav_files):
        if False and speech_files is not None:
            print("Reading: " + this_wav_file + " @ " + str(i + 1) +
                  " of " + str(len(wav_files)))
        names = [all_names[i]]
        this_fs, airs = wavfile.read(this_wav_file)
        airs = airs.astype(float)
        if airs.ndim > 1:
            if choose_channel is not None:
                if type(choose_channel) is list:
                    airs = airs[:, choose_channel[i]]
                    names[0] += '_ch' + str(choose_channel[i])
                else:
                    airs = airs[:, choose_channel]
                    names[0] += '_ch' + str(choose_channel)
            else:
                names = [names[0] + '_' + str(ch_id)
                         for ch_id in range(airs.shape[1])]
                airs = airs.T
        airs = np.atleast_2d(airs)
        airs /= np.repeat(np.atleast_2d(abs(airs).max()).T,
                          airs.shape[1], 1).astype(float)
        if airs.shape[0] > 1 and given_associations is not None:
            raise AssertionError(
                'Cannot work out given associations for multichannel airs')
        this_speech_all = []
        this_rev_speech_all = []
        if speech_files is not None:
            for ch_id in range(airs.shape[0]):
                if given_associations is None:
                    chosen_file = sample(range(len(speech_files)), 1)[0]
                    this_speech_file = speech_files[chosen_file]
                else:
                    chosen_file = given_associations['speech'][i]
                    this_speech_file = chosen_file
                associations.append(chosen_file)
                this_speech_fs, this_speech = wavfile.read(this_speech_file)
                if this_speech.ndim > 1:
                    raise AssertionError(
                        'Can\'t deal with multichannel speech in this context')
                if not this_speech_fs == this_fs:
                    this_speech = my_resample(this_speech, this_speech_fs,
                                              this_fs)
                max_offset_for_check = None
                if max_speech_read_samples is not None:
                    max_offset_for_check = (this_speech.size -
                                            max_speech_read_samples)
                    offset = randint(0, this_speech.size -
                                     max_speech_read_samples)
                    this_speech = this_speech[offset:offset +
                                              max_speech_read_samples]
                else:
                    offset = 0
                if given_associations is not None:
                    offset = given_associations['offsets'][i]
                    if max_speech_read_samples is not None:
                        if offset >= max_offset_for_check:
                            raise AssertionError(
                                'Invalid offset from given associations, '
                                'got ' + str(offset) + ' expected max is ' +
                                str(this_speech.size -
                                    max_speech_read_samples))
                # Trim leading and trailing zeros from the AIR before the
                # convolution
                conv_air = np.array(airs[ch_id, :])
                conv_air = conv_air[np.where(~(conv_air == 0))[-1][0]:
                                    np.where(~(conv_air == 0))[-1][-1]]
                # Making convolution
                this_rev_speech = fftconvolve(this_speech, conv_air, 'same')
                # Remove the direct-path arrival delay and keep the length
                dp_arival = np.argmax(abs(conv_air))
                this_rev_speech = this_rev_speech[dp_arival:]
                if dp_arival > 0:
                    this_rev_speech = np.concatenate(
                        (this_rev_speech,
                         np.zeros(dp_arival, dtype=this_rev_speech.dtype)))
                this_speech = np.atleast_2d(this_speech)
                this_rev_speech = np.atleast_2d(this_rev_speech)
                this_speech_all.append(this_speech)
                this_rev_speech_all.append(this_rev_speech)
                offsets.append(offset)
                if save_speech_examples >= save_counter:
                    save_names = [
                        basename(this_wav_file).replace('.wav', '') +
                        '_air_' + str(offset) + '.wav',
                        basename(this_wav_file).replace('.wav', '') +
                        '_clean_speech_' + str(offset) + '.wav',
                        basename(this_wav_file).replace('.wav', '') +
                        '_rev_speech_' + str(offset) + '.wav']
                    for examples in range(len(save_names)):
                        save_names[examples] = (base_examples_dir +
                                                save_names[examples])
                    write_wav(save_names[0], this_fs, airs[ch_id, :])
                    write_wav(save_names[1], this_fs, this_speech.flatten())
                    write_wav(save_names[2], this_fs,
                              this_rev_speech.flatten())
                    save_counter += 1
            this_speech = np.concatenate(this_speech_all, axis=0)
            this_rev_speech = np.concatenate(this_rev_speech_all, axis=0)
        if not this_fs == fs:
            raise AssertionError('Your sampling rates are not consistent')
        if i > 0:
            ids = np.concatenate((ids, names))
        else:
            ids = names
        if max_air_read is not None:
            airs = airs[:, 0:max_air_read_samples]
        if False and speech_files is not None:
            print("Got " + str(airs.shape))
        airs = resample_op(airs)
        if airs.ndim < 2:
            airs = np.atleast_2d(airs)
        # print('Done resampling')
        if i > 0:
            # Grow the collection arrays, padding either the collection or
            # the new signals so that their lengths match
            if x.shape[1] < airs.shape[1]:
                npads = -x.shape[1] + airs.shape[1]
                x = np.concatenate(
                    (x, np.zeros((x.shape[0], npads)).astype(x.dtype)),
                    axis=1)
                x = np.concatenate((x, airs), axis=0)
            else:
                if x.shape[1] > airs.shape[1]:
                    npads = x.shape[1] - airs.shape[1]
                    airs = np.concatenate(
                        (airs,
                         np.zeros((airs.shape[0], npads)).astype(airs.dtype)),
                        axis=1)
                x.resize((x.shape[0] + airs.shape[0], x.shape[1]),
                         refcheck=False)
                x[-airs.shape[0]:, :] = np.array(airs)
            if speech_files is not None:
                if x_speech.shape[1] < this_speech.shape[1]:
                    npads = -x_speech.shape[1] + this_speech.shape[1]
                    x_speech = np.concatenate(
                        (x_speech,
                         np.zeros((x_speech.shape[0], npads)).astype(
                             x_speech.dtype)),
                        axis=1)
                    x_speech = np.concatenate((x_speech, this_speech), axis=0)
                else:
                    if x_speech.shape[1] > this_speech.shape[1]:
                        npads = x_speech.shape[1] - this_speech.shape[1]
                        this_speech = np.concatenate(
                            (this_speech,
                             np.zeros((this_speech.shape[0], npads)).astype(
                                 this_speech.dtype)),
                            axis=1)
                    x_speech.resize(
                        (x_speech.shape[0] + this_speech.shape[0],
                         x_speech.shape[1]),
                        refcheck=False)
                    x_speech[-this_speech.shape[0]:, :] = this_speech
                if x_rev_speech.shape[1] < this_rev_speech.shape[1]:
                    npads = (-x_rev_speech.shape[1] +
                             this_rev_speech.shape[1])
                    x_rev_speech = np.concatenate(
                        (x_rev_speech,
                         np.zeros((x_rev_speech.shape[0], npads)).astype(
                             x_rev_speech.dtype)),
                        axis=1)
                    x_rev_speech = np.concatenate(
                        (x_rev_speech, this_rev_speech), axis=0)
                else:
                    if x_rev_speech.shape[1] > this_rev_speech.shape[1]:
                        npads = (x_rev_speech.shape[1] -
                                 this_rev_speech.shape[1])
                        this_rev_speech = np.concatenate(
                            (this_rev_speech,
                             np.zeros((this_rev_speech.shape[0],
                                       npads)).astype(
                                 this_rev_speech.dtype)),
                            axis=1)
                    x_rev_speech.resize(
                        (x_rev_speech.shape[0] + this_rev_speech.shape[0],
                         x_rev_speech.shape[1]),
                        refcheck=False)
                    x_rev_speech[-this_rev_speech.shape[0]:, :] = \
                        this_rev_speech
        else:
            x = np.array(airs)
            if speech_files is not None:
                x_speech = np.array(this_speech)
                x_rev_speech = np.array(this_rev_speech)

    if save_speech_associations:
        df = pd.DataFrame({
            'air': wav_files,
            'speech': np.array(speech_files)[associations]
            if given_associations is None else given_associations['speech'],
            'offsets': offsets if given_associations is None
            else given_associations['offsets']})
        df.to_csv(filename_associations, index=False)
        print('Saved: ' + filename_associations +
              ('' if given_associations is None
               else ' which was created from the given associations'))
        if copy_associations_to is not None:
            run_command('cp ' + filename_associations + ' ' +
                        copy_associations_to)
            print('Saved: ' + copy_associations_to)

    if fs is not None:
        print('Got ' + str(x.shape[0]) + ' AIRs of duration ' +
              str(x.shape[1] / float(fs)))
    else:
        print('Got ' + str(x.shape[0]) + ' AIRs of length ' +
              str(x.shape[1]))

    if speech_files is not None:
        proc_data = x_rev_speech
    else:
        proc_data = x
    if drop_speech:
        x_rev_speech = []
        x_speech = []
        x = []

    if no_fex:
        x_out = None
        print('Skipping feature extraction')
    else:
        x_out = data_post_proc(np.array(proc_data), forced_fs, start_at_max,
                               framesize, get_pow_spec, max_air_len,
                               wavform_logpow)
        print('Left with ' + str(x_out.shape) + ' AIR features data ')

    ids = ids.astype(str)

    wrote_h5 = False
    if write_cached_latest:
        try:
            hf = File(latest_file, 'w')
            if no_fex:
                hf.create_dataset('x', data=[])
            else:
                hf.create_dataset('x', data=x_out)
            hf.create_dataset('y', data=[])
            hf.create_dataset('ids', data=ids)
            hf.create_dataset('class_names', data=[])
            hf.create_dataset('airs', data=x)
            hf.create_dataset('utts', data=utt_per_env)
            if speech_files is not None:
                hf.create_dataset('clean_speech', data=x_speech)
                hf.create_dataset('rev_names', data=x_rev_speech)
            else:
                hf.create_dataset('clean_speech', data=[])
                hf.create_dataset('rev_names', data=[])
            hf.close()
            wrote_h5 = True
            print('Wrote: ' + str(latest_file))
        except IOError as ME:
            print('Cache writing failed with ' + str(ME))

    if (not wrote_h5) and as_hdf5_ds:
        raise AssertionError('Could not provide data in correct format')

    if as_hdf5_ds:
        hf = File(latest_file, 'r')
        x_out = hf['x']
        ids = hf['ids']
        x = hf['airs']
        x_speech = hf['clean_speech']
        x_rev_speech = hf['rev_names']
        # hf.close()

    return ((x_out, None), ids, None,
            (x, x_speech, x_rev_speech), utt_per_env)
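

# Illustrative usage sketch (not part of the original module). The directory
# names, sampling frequency and framesize below are placeholders; the call
# scans the AIR and speech directories, convolves every AIR with two randomly
# chosen utterances and returns the processed features with the raw signals.
def _example_read_airs_from_wavs():
    (x, _), ids, _, (airs, speech, rev_speech), utts = read_airs_from_wavs(
        ['/tmp/airs/'], framesize=320, fs=16000, forced_fs=16000,
        max_air_len=1.0, utt_per_env=2, speech_files=['/tmp/speech/'],
        cacheloc='/tmp/cache/', scratchpad='/tmp/scratch/',
        as_hdf5_ds=False, read_cached_latest=False)
    print(str(len(ids)) + ' training examples of shape ' + str(x.shape))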


                                 '_result_modeled_air_baselines.pdf')
score_names, scores = acenvmodel.get_eval_scores(verbose=True)
acenvmodel.plot_modeling_results(saveloc=results_dir + '/' + the_name +
                                 '_result_modeled_air_hat.pdf',
                                 interactive=interactive)
if not early_only:
    rir = acenv.impulse_response[:, do_channel]
    rir_hat = acenvmodel.air_reconstructed_from_model.flatten()
    try:
        fs_speech, s = wavfile.read(speech_loc)
    except IOError as ME:
        print('Could not read speech file ' + speech_loc + ' with: ' +
              str(ME))
        exit(0)
    s = (s[0:fs_speech * 10].astype('float128') /
         s[0:fs_speech * 10].max()).astype(float)
    if not model_fs == fs_speech:
        s = my_resample(s, fs_speech, model_fs)
    rev = np.convolve(s, rir)
    rev_hat = np.convolve(s, rir_hat)

    def write_the_wav(filename, x):
        write_wav(filename, model_fs, x)

    write_the_wav(results_dir + '/' + the_name + '_clean.wav', s)
    write_the_wav(results_dir + '/' + the_name + '_rir_hat.wav', rir_hat)
    write_the_wav(results_dir + '/' + the_name + '_rir.wav', rir)
    write_the_wav(results_dir + '/' + the_name + '_rev.wav', rev)
    write_the_wav(results_dir + '/' + the_name + '_rev_hat.wav', rev_hat)


def __init__(self, name='', filename='', samples=None, sampling_freq=0,
             keep_channel=None, max_allowed_silence=0.001,
             is_simulation=None, silent_samples_offset=False,
             matlab_engine=None):
    """
    Args:
        name: Used as the label for the object
        filename: The filename for the measured/simulated AIR
        samples: The samples of the measured/simulated AIR
        sampling_freq (int): The sampling frequency for the AIR
    """
    self.name = name
    self.sampling_freq = 0
    self.impulse_response = np.array([])
    self.room_name = None
    self.room_type = None
    self.room_dimensions = (None, None, None)
    self.rec_position = [(None, None, None)]
    self.src_position = (None, None, None)
    self.is_simulation = is_simulation
    self.from_database = None
    self.receiver_name = None
    self.receiver_config = None
    self.known_room = False
    self.filename = filename
    self.nchannels = 0
    if (len(filename) == 0) and (samples is None):
        raise NameError(getfname() + ':FilenameOrSamplesRequired')
    if samples is not None:
        self.impulse_response = samples
        if sampling_freq <= 0:
            raise AssertionError('SamplingFreqCannotBe0orNegative')
        self.sampling_freq = sampling_freq
        if keep_channel is not None:
            self.impulse_response = self.get_channel(keep_channel)
    else:
        self.sampling_freq, self.impulse_response = wavfile.read(
            self.filename)
        if keep_channel is not None:
            self.impulse_response = self.get_channel(keep_channel)
        self.impulse_response = self.impulse_response.astype(
            float) / np.max(np.abs(self.impulse_response))
        if sampling_freq > 0:
            self.impulse_response = np.array(
                my_resample(np.array(self.impulse_response),
                            self.sampling_freq, sampling_freq,
                            matlab_eng=matlab_engine))
            self.sampling_freq = sampling_freq
    try:
        if self.impulse_response.ndim == 1:
            self.impulse_response = column_vector(self.impulse_response)
    except AttributeError:
        pass
    if len(filename) > 0:
        self.add_room_info()
        self.add_receiver_info()
    if self.impulse_response.ndim < 2:
        self.nchannels = 1
    else:
        self.nchannels = self.impulse_response.shape[1]
    scale_by = float(abs(self.impulse_response).max())
    if scale_by > 0:
        self.impulse_response = self.impulse_response / scale_by
    if self.impulse_response is not None:
        # The silence allowance is converted to samples using the final
        # sampling frequency of the impulse response
        max_allowed_silence_samples = int(
            np.ceil(max_allowed_silence * self.sampling_freq))
        start_sample = self.impulse_response.shape[0]
        for i in range(self.impulse_response.shape[1]):
            start_sample = min(
                start_sample,
                max(0,
                    np.nonzero(self.impulse_response[:, i])[0][0] -
                    max_allowed_silence_samples))
        if start_sample > 0 and silent_samples_offset:
            self.impulse_response = self.impulse_response[start_sample:, :]
            print('Offset AIR by ' + str(start_sample) + ' samples')
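

# Illustrative usage sketch (not part of the original module). The enclosing
# class name is assumed here to be AIR (a hypothetical name, since the class
# statement is outside this section) and the wav path is a placeholder; the
# constructor can also be fed raw samples instead of a filename.
def _example_air_construction():
    air = AIR(name='lecture_room_1_ch0', filename='/tmp/air_example.wav',
              sampling_freq=16000, keep_channel=0, silent_samples_offset=True)
    print(air.nchannels, air.sampling_freq, air.impulse_response.shape)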


args = parser.parse_args()
print('Args given : ' + str(args))
if args.nodisplay:
    use('Agg')
import matplotlib.pyplot as plt

doing_e2e = args.e2e
file_loc = args.file_loc
speech_file = args.speech_file
savelocation = args.saveloc + '/gan_acenv_' + get_timestamp() + '/'

if speech_file is not None:
    fs, speech_samples = wavfile.read(speech_file)
    speech_samples = my_resample(speech_samples, fs, global_fs)
    speech_samples = speech_samples[0:int(global_fs * 6)]
    speech_samples = speech_samples / float(np.abs(speech_samples).max())
else:
    speech_samples = None

if doing_e2e:
    from fe_utils import get_ace_xy
    from utils_spaudio import align_max_samples
    if file_loc is None:
        file_loc = '../results_dir/ace_h5_info.h5'
    model_framesize = 64
    max_air_len_import = air_len
    wavform_logpow = False
    get_pow_spec = False