def load_datasets(self):
    """Load the Train/Val/Test spectra (and states, when configured) into
    torch datasets and gather per-file metadata into one dataframe.

    Stores the three datasets on self._train_set / self._val_set /
    self._test_set.

    Returns:
    pd.DataFrame -- metadata of all files, with a leading 'Set' column
        naming the subset ('Train', 'Val' or 'Test').
    """
    datasets = []
    dataframes = []
    # the optional-states configuration cannot change mid-loop, so check once
    # (the original checked hasattr twice, splitting one decision in two)
    use_states = hasattr(self, "_states")
    for subset in ["Train", "Val", "Test"]:
        # load spectra
        dir_spectra = os.path.join(self._dir_root_set, "Dataset", subset,
                                   "Spectra")
        files_spectra = retrieve_files(dir_spectra)
        # load data into torch set, dump info into df
        if use_states:
            # load the matching states and build from both modalities
            dir_states = os.path.join(self._dir_root_set, "Dataset", subset,
                                      "States")
            files_states = retrieve_files(dir_states)
            dataset, df = self._load_dataset(files_spectra, files_states)
        else:
            dataset, df = self._load_dataset(files_spectra)
        datasets.append(dataset)
        df.insert(0, "Set", subset)
        dataframes.append(df)
    self._train_set, self._val_set, self._test_set = datasets
    dataframe = pd.concat(dataframes)
    return dataframe
def plot_features(self, subset, states=None, idx=0):
    """Plot one spectrum of a subset together with its synchronized states.

    Keyword arguments:
    subset -- name of the subset to plot from (e.g. 'Train'),
    states -- iterable of state-group names to plot below the spectrum
        (default: None, i.e. plot all groups listed below),
    idx -- index of the file within the subset to plot (default: 0).

    Returns:
    the created matplotlib figure.
    """
    # plot all states by default
    if states is None:
        states = [
            "rpm",
            "rpm_delta",
            "cmd",
            "cmd_delta",
            "height",
            "vel",
            "acc",
            "angles",
            "rates",
        ]
    # set up figure: one row for the spectrum plus one row per state group
    n_rows = len(states) + 1
    fig = plt.figure(figsize=(8, 3 * n_rows), constrained_layout=True)
    gs = fig.add_gridspec(n_rows, 1)
    # load spectrum
    dir_spectrum = os.path.join(self._dir_root_set, "Dataset", subset,
                                "Spectra")
    files_spectrum = retrieve_files(dir_spectrum)
    Z = pd.read_csv(files_spectrum[idx], header=None).to_numpy()
    # set plot title according to the configured feature type
    if self._feature["feature"] == "Mfcc":
        title = "MFCC (%d bins)" % self._feature["mfc_coefficients"]
    elif self._feature["feature"] == "Stft":
        title = "Spectrogram (%d frequency bins)" % self._feature[
            "frequency_bins"]
    else:
        title = "%s-spectrogram (%d frequency bins)" % (
            self._feature["feature"],
            self._feature["frequency_bins"],
        )
    # plot spectrum in the top row
    ax = fig.add_subplot(gs[0])
    ax.set_title(title)
    ax = ph.plot_spectrum(Z, self._feature)
    # load states file with the same index as the spectrum
    dir_states = os.path.join(self._dir_root_set, "Dataset", subset, "States")
    files_states = retrieve_files(dir_states)
    S = pd.read_csv(files_states[idx], header=None).to_numpy()
    # plot relevant states, one group per remaining row
    colors = ["orangered", "darkolivegreen", "steelblue", "goldenrod"]
    for i, state_name in enumerate(states):
        ax = fig.add_subplot(gs[1 + i])
        ax = ph.plot_states_synchronized(S, state_name, self._feature, colors)
    plt.show()
    return fig
def create_mixed_test_set(self, noise_ratio, seed=42, overwrite=False):
    """Mix the clean test spectra with ego-noise spectra at a given ratio.

    Keyword arguments:
    noise_ratio -- the ratio of the noise compared to the signal,
    seed -- base seed used to derive reproducible per-file seeds
        (default: 42),
    overwrite -- whether to overwrite existing data (default: False).
    """
    # make state dir. if it does not exist (do not overwrite, ever)
    dir_out = os.path.join(self._dir_root_ac, "Features", "Mixed", "States",
                           "Test")
    if not os.path.exists(dir_out):
        os.makedirs(dir_out)
    # set output directory (spectra)
    dir_out = os.path.join(
        self._dir_root_ac,
        "Features",
        "Mixed",
        "Spectra",
        "Ratio_%.2f" % noise_ratio,
        "Test",
    )
    # check if it exists or should be overwritten
    if os.path.exists(dir_out) and not overwrite:
        return
    refresh_directory(dir_out)
    # retrieve 'clean' spectra (ESC-50)
    dir_clean = os.path.join(self._dir_root_ac, "Features", "Clean",
                             "Spectra", "Test")
    files_clean = retrieve_files(dir_clean)
    # retrieve ego-noise spectra (MAV)
    dir_noise = os.path.join(self._dir_root_enp, "Dataset", "Test", "Spectra")
    files_noise = retrieve_files(dir_noise)
    # retrieve states belonging to ego-noise
    dir_states = os.path.join(self._dir_root_enp, "Dataset", "Test", "States")
    files_states = retrieve_files(dir_states)
    # generate a list of directory-specific 'seeds' from the given seed
    # to preserve reproducible randomness while multiprocessing
    dir_seed = seed + len("Test")
    np.random.seed(dir_seed)
    seeds = np.random.randint(0, 10 * len(files_clean), len(files_clean))
    # set up multiprocessing to mix audio
    part = partial(
        self._create_mixed_spectrum,
        files_noise=files_noise,
        files_states=files_states,
        ratio=noise_ratio,
        subset="Test",
    )
    # max(1, ...): Pool(processes=0) raises ValueError on single-core hosts
    with Pool(processes=max(1, os.cpu_count() - 1)) as pool:
        pool.starmap(part, list(zip(files_clean, seeds)))
def create_clean_dataset(self, augmentations=None, overwrite=False):
    """Create (extract and export) the clean dataset from the audio data.

    Keyword arguments:
    augmentations -- iterable of augmentation names; when given, extract the
        default subsets plus the matching 'Train <augmentation>' sets
        (default: None, i.e. extract all sets found in the audio root),
    overwrite -- whether to overwrite existing data (default: False).
    """
    # get list of dataset directory names
    if augmentations is not None:
        subsets = ["Train", "Val", "Test"]
        subsets += ["Train " + aug for aug in augmentations]
    else:
        # extract all sets available
        subsets = sorted(os.listdir(self._dir_root_audio))
    for subset in subsets:
        # output directory (spectra)
        dir_output = os.path.join(self._dir_root_ac, "Features", "Clean",
                                  "Spectra", subset)
        # only extract feature if set does not exist or should be overwritten
        if os.path.exists(dir_output) and not overwrite:
            continue
        refresh_directory(dir_output)
        # get audio files (ESC-50)
        dir_input = os.path.join(self._dir_root_audio, subset)
        files_input = retrieve_files(dir_input)
        # multiprocessing; max(1, ...) keeps Pool valid on single-core hosts
        part = partial(self._export_spectrum, dir_out=dir_output)
        with Pool(processes=max(1, os.cpu_count() - 1)) as pool:
            pool.map(part, files_input)
def extract_spectra(self, offset=50, scaling=80):
    """Extract, normalize and export the (unsynchronized) spectrum of every
    audio file in the Train/Val/Test subsets.

    Keyword arguments:
    offset -- value added to each spectrum before scaling (default: 50),
    scaling -- value each offset spectrum is divided by (default: 80).
    """
    for subset in ["Train", "Val", "Test"]:
        # collect the audio files belonging to this subset
        dir_audio = os.path.join(self._dir_root, "Ego-Noise Prediction",
                                 "Dataset", subset, "Audio")
        audio_files = retrieve_files(dir_audio)
        # start from a fresh directory for the unsynchronized spectra
        dir_out = os.path.join(self._dir_root_set, "Unsynchronized", subset,
                               "Spectra")
        refresh_directory(dir_out)
        for file_audio in audio_files:
            # extract the spectrum and shift/scale it into range
            spectrum = fh.extract_spectrum(file_audio, self._feature)
            spectrum += offset
            spectrum /= scaling
            # export under the source name, with a csv extension
            name_out = os.path.split(file_audio)[-1].replace(".wav", ".csv")
            path_out = os.path.join(dir_out, name_out)
            pd.DataFrame(spectrum).to_csv(path_out, index=False, header=False)
        print_verbose(
            self.super_verbose,
            "Finished extracting feature for '%s' set." % subset,
        )
def classify_mismatched_test_set(self, model):
    """Classify the test set with the given model.

    Returns the per-sample metadata dataframe extended with the model's
    predictions.
    """
    # gather the test spectra
    dir_spectra = os.path.join(self._dir_root_set, "Dataset", "Test",
                               "Spectra")
    files_spectra = retrieve_files(dir_spectra)
    # build the torch dataset, with matching states when configured
    if hasattr(self, "_states"):
        dir_states = os.path.join(self._dir_root_set, "Dataset", "Test",
                                  "States")
        files_states = retrieve_files(dir_states)
        dataset, df = self._load_dataset(files_spectra, files_states)
    else:
        dataset, df = self._load_dataset(files_spectra)
    # run the classifier and return the annotated dataframe
    return self._classify_set(model, dataset, df)
def save_network_output(self, model, dir_model, subset, plot=True):
    """Predict spectra from states for each file in a subset and export the
    original, predicted and residual spectra under the model directory.

    Keyword arguments:
    model -- the trained prediction model,
    dir_model -- root directory of the model,
    subset -- name of the subset to process (e.g. 'Test'),
    plot -- whether to also plot each triplet (default: True).
    """
    # start each output subdirectory from scratch
    output_subdirs = ["Original", "Predicted", "Residual"]
    for subdir in output_subdirs:
        refresh_directory(os.path.join(dir_model, "Output", subset, subdir))
    # list the original files (states, spectra) in the subset
    dir_states = os.path.join(self._dir_root_set, "Dataset", subset, "States")
    files_states = retrieve_files(dir_states)
    dir_spectra = os.path.join(self._dir_root_set, "Dataset", subset,
                               "Spectra")
    files_spectra = retrieve_files(dir_spectra)
    # walk the file pairs in lockstep
    for file_states, file_spectrum in zip(files_states, files_spectra):
        # load the original spectrum and drop the leading context frames
        original = pd.read_csv(file_spectrum, header=None).to_numpy()
        context = self._states["context_frames"]
        if context > 0:
            original = original[:, context:]
        # predict a spectrum of the same shape from the states file
        predicted = self._predict(model, file_states, original.shape)
        residual = original - predicted
        # optionally visualize the triplet
        if plot:
            self._plot_model_output(original, predicted, residual)
        # export each spectrum under its matching subdirectory
        fn = os.path.split(file_states)[-1]  # target filename
        triplet = [original, predicted, residual]
        for spectrum, subdir in zip(triplet, output_subdirs):
            dir_out = os.path.join(dir_model, "Output", subset, subdir)
            pd.DataFrame(spectrum).to_csv(os.path.join(dir_out, fn),
                                          index=False,
                                          header=False)
def _load_data(self, dir_split):
    """Load one dataset split (states as input, spectra as output) into a
    torch TensorDataset.

    Argument:
    dir_split -- directory containing 'States' and 'Spectra'
        subdirectories with N matching csv files each.

    Returns:
    torch.utils.data.TensorDataset of float (X, Y) pairs; X are (context
    windows of) state vectors, Y the corresponding spectrum frames.
    """
    # load N files
    files_X = retrieve_files(os.path.join(dir_split, "States"))  # input
    files_Y = retrieve_files(os.path.join(dir_split, "Spectra"))  # output
    # load states: NxTxS (transpose puts time first within each file)
    data_X = [
        pd.read_csv(f, header=None).to_numpy().transpose() for f in files_X
    ]
    # extract only relevant states (as configured in self._states["states"])
    data_X = [
        fh.extract_relevant_states(data, self._states["states"])
        for data in data_X
    ]
    # load spectra: NxTxF
    data_Y = [
        pd.read_csv(f, header=None).to_numpy().transpose() for f in files_Y
    ]
    if self._states["context_frames"] > 0:
        # add context to the dataset: (NxTxS, NxTxF) -> (NxT-CxCxS, NxT-CxCxF)
        data_X, data_Y = list(
            zip(*[
                self._add_context(dX, dY) for dX, dY in zip(data_X, data_Y)
            ]))
    else:
        # add placeholder dim. for X: NxTxS -> NxTx1xS
        # (keeps the window axis present so downstream code sees one layout)
        data_X = [np.expand_dims(X, 1) for X in data_X]
    # concatenate N and T axes to get 3D set
    data_X = np.concatenate(data_X, axis=0)
    data_Y = np.concatenate(data_Y, axis=0)
    # convert to torch dataset
    X = torch.from_numpy(data_X).float()
    Y = torch.from_numpy(data_Y).float()
    dataset = torch.utils.data.TensorDataset(X, Y)
    return dataset
def synchronize_data(self, skip_takeoff=True):
    """Synchronize each (spectrum, states) file pair of every subset and
    write the results into the 'Dataset' directory tree.

    Keyword argument:
    skip_takeoff -- forwarded to the pair synchronizer (default: True).
    """
    for subset in ["Train", "Val", "Test"]:
        # unsynchronized spectra of this subset
        dir_spectra = os.path.join(self._dir_root_set, "Unsynchronized",
                                   subset, "Spectra")
        files_spectra = retrieve_files(dir_spectra)
        # unsynchronized states of this subset
        dir_states = os.path.join(self._dir_root_set, "Unsynchronized",
                                  subset, "States")
        files_states = retrieve_files(dir_states)
        # start from fresh output directories under the dataset root
        dir_root_output = os.path.join(self._dir_root_set, "Dataset", subset)
        for subdir in ("Spectra", "States"):
            refresh_directory(os.path.join(dir_root_output, subdir))
        # synchronize the file pairs in lockstep
        for file_spectrum, file_states in zip(files_spectra, files_states):
            self._synchronize_pair(file_spectrum, file_states,
                                   dir_root_output, skip_takeoff)
def plot_spectra_denoised(
    self,
    noise_ratio,
    categories=None,
    idx=0,
    plot_clean=True,
    plot_noise=True,
    plot_mixed=True,
    plot_predicted=True,
    enp_model_index=0,
):
    """Plot a selection of spectra of a denoised test set.

    Keyword arguments:
    noise_ratio -- the ratio of the noise compared to the signal,
    categories -- iterable containing the original categories (airplane,
        engine, etc.) to be plotted (default: all),
    idx -- index of the feature selected for plotting (default: 0),
    plot_clean -- whether to plot the clean spectra (default: True),
    plot_noise -- whether to plot the noise-only spectra (default: True),
    plot_mixed -- whether to plot the mixed spectra (default: True),
    plot_predicted -- whether to plot the predicted ego-noise spectra
        (default: True),
    enp_model_index -- selects which ego-noise predictor (enp) to use for
        the prediction, if multiple are available (default: 0).

    Example usage:
    plot_spectra_denoised(1.0, ['airplane', 'helicopter'],
                          plot_noise=False, plot_predicted=False)
    This plots the clean, mixed and denoised spectra of the first
    spectrogram belonging to the airplane and helicopter categories within
    the test set with a noise ratio of 1.00.
    """
    # define directories
    dir_denoised = os.path.join(
        self._dir_root_ac,
        "Features",
        "Denoised",
        "Spectra",
        "Ratio_%.2f" % noise_ratio,
        "Test",
    )
    if plot_clean:
        dir_clean = os.path.join(
            self._dir_root_ac,
            "Features",
            "Clean",
            "Spectra",
            "Test",
        )
    if plot_noise:
        # load noise files
        dir_noise = os.path.join(
            self._dir_root_enp,
            "Dataset",
            "Test",
            "Spectra",
        )
        files_noise = retrieve_files(dir_noise)
    if plot_mixed:
        dir_mixed = os.path.join(
            self._dir_root_ac,
            "Features",
            "Mixed",
            "Spectra",
            "Ratio_%.2f" % noise_ratio,
            "Test",
        )
    if plot_predicted:
        # select appropriate model
        dir_model_enp = sorted(
            os.listdir(os.path.join(self._dir_root_enp,
                                    "Models")))[enp_model_index]
        dir_predicted = os.path.join(
            self._dir_root_enp,
            "Models",
            dir_model_enp,
            "Output",
            "Test",
            "Predicted",
        )
        files_predicted = retrieve_files(dir_predicted)
        # get model context for offset
        context = int(dir_model_enp.split("_")[-1][-1])
    if plot_noise or plot_predicted:
        # also load states files to recover noise indices
        dir_states = os.path.join(
            self._dir_root_ac,
            "Features",
            "Mixed",
            "States",
            "Test",
        )
        files_states = retrieve_files(dir_states)
    # list filenames from the denoised directory, which is always defined
    # (previously dir_mixed was used here, raising a NameError whenever
    # plot_mixed=False; denoised files carry the same names)
    filenames = sorted(os.listdir(dir_denoised))
    # plot all categories if not given
    if categories is None:
        file_categories = [f.split("_")[0] for f in filenames]
        categories = sorted(set(file_categories))
    # setup figure, subfigures (two rows per variant: spectrum + deltas)
    n_categories = len(categories)
    n_variants = (1 + int(plot_clean) + int(plot_noise) + int(plot_mixed) +
                  int(plot_predicted))
    fig = plt.figure(
        figsize=(6 * n_categories, 4 * n_variants),
        constrained_layout=True,
    )
    gs = fig.add_gridspec(n_variants * 2, n_categories)
    # loop through categories
    for i, cat in enumerate(categories):
        # get filename
        fn = [f for f in filenames if f.split("_")[0] == cat][idx]
        # load denoised file
        file_denoised = os.path.join(dir_denoised, fn)
        D = pd.read_csv(file_denoised, header=None).to_numpy()
        # store spectra and titles
        spectra = []
        titles = []
        if plot_clean:
            # load clean file
            file_clean = os.path.join(dir_clean, fn)
            C = pd.read_csv(file_clean, header=None).to_numpy()
            # add to lists
            spectra.append(C)
            titles.append("'%s': Clean Sound (%s)" %
                          (fn, self._feature["feature"]))
        if plot_noise or plot_predicted:
            # get filename of states file belonging to noise file
            fn_states = os.path.split([
                f for f in files_states if fn.replace(".csv", "") in f
            ][0])[-1]
            # get file idx, frame idx from states filename for noise file
            idx_file = int(fn_states.split("_")[-2])
            idx_frame = int(fn_states.split("_")[-1].split(".")[0])
        if plot_noise:
            # load noise file from idx_file, slice fragment at idx_frame
            file_noise = files_noise[idx_file]
            N = pd.read_csv(file_noise,
                            header=None).to_numpy()[:, idx_frame:idx_frame +
                                                    D.shape[1]]
            # add deltas to noise fragment
            N = np.concatenate((N, librosa.feature.delta(N, mode="mirror")),
                               axis=0)
            # add to lists
            spectra.append(N)
            titles.append("MAV Noise (%s)" % self._feature["feature"])
        if plot_mixed:
            # load mixed file
            file_mixed = os.path.join(dir_mixed, fn)
            M = pd.read_csv(file_mixed, header=None).to_numpy()
            # add to lists
            spectra.append(M)
            titles.append("'%s': Noisy Mix (%s)" %
                          (fn, self._feature["feature"]))
        if plot_predicted:
            # load predicted file, offsetting the slice by the model context
            file_pred = files_predicted[idx_file]
            P = pd.read_csv(file_pred, header=None).to_numpy()[:, idx_frame -
                                                               context:
                                                               idx_frame -
                                                               context +
                                                               D.shape[1]]
            # add deltas to predicted fragment
            P = np.concatenate((P, librosa.feature.delta(P, mode="mirror")),
                               axis=0)
            spectra.append(P)
            titles.append("Predicted MAV Noise (%s)" %
                          self._feature["feature"])
        # add denoised file to end of lists
        spectra.append(D)
        titles.append("'%s': Denoised Sound (%s)" %
                      (fn, self._feature["feature"]))
        for j, Z in enumerate(spectra):
            # plot spectrum (first half of the rows)
            ax = fig.add_subplot(gs[2 * j, i])
            ph.plot_spectrum(Z[:Z.shape[0] // 2], self._feature)
            ax.set_title(titles[j])
            # plot delta-spectrum (second half of the rows)
            ax = fig.add_subplot(gs[2 * j + 1, i])
            ph.plot_spectrum(Z[Z.shape[0] // 2:],
                             self._feature,
                             colormap="coolwarm")
def plot_spectra_mixed(
    self,
    set_name,
    noise_ratio,
    categories=None,
    idx=0,
    plot_clean=True,
    plot_noise=True,
):
    """Plot a selection of spectra of a mixed dataset.

    Keyword arguments:
    set_name -- set to be plotted (e.g. 'Test'),
    noise_ratio -- the ratio of the noise compared to the signal,
    categories -- iterable containing the original categories (airplane,
        engine, etc.) to be plotted (default: all),
    idx -- index of the feature selected for plotting (default: 0),
    plot_clean -- whether to plot the clean spectra (default: True),
    plot_noise -- whether to plot the noise-only spectra (default: True).

    Example usage:
    plot_spectra_mixed('Train', 1.0, ['airplane'], plot_noise=False)
    This plots the clean and mixed spectra of the first spectrogram
    belonging to the airplane category within the training set with a
    noise ratio of 1.00.
    """
    # define directories
    dir_mixed = os.path.join(
        self._dir_root_ac,
        "Features",
        "Mixed",
        "Spectra",
        "Ratio_%.2f" % noise_ratio,
        set_name,
    )
    if plot_clean:
        dir_clean = os.path.join(self._dir_root_ac, "Features", "Clean",
                                 "Spectra", set_name)
    if plot_noise:
        # load noise files (augmented sets share the base subset's noise)
        dir_noise = os.path.join(self._dir_root_enp, "Dataset",
                                 set_name.split(" ")[0], "Spectra")
        files_noise = retrieve_files(dir_noise)
        # also load states files to recover noise indices
        dir_states = os.path.join(self._dir_root_ac, "Features", "Mixed",
                                  "States", set_name)
        files_states = retrieve_files(dir_states)
    # plot all categories if not given
    filenames = sorted(os.listdir(dir_mixed))
    if categories is None:
        file_categories = [f.split("_")[0] for f in filenames]
        categories = sorted(set(file_categories))
    # setup figure, subfigures (two rows per variant: spectrum + deltas)
    n_categories = len(categories)
    n_variants = 1 + int(plot_clean) + int(plot_noise)
    fig = plt.figure(
        figsize=(6 * n_categories, 4 * n_variants),
        constrained_layout=False,
    )
    gs = fig.add_gridspec(n_variants * 2, n_categories)
    # loop through categories
    for i, cat in enumerate(categories):
        # get filename
        fn = [f for f in filenames if f.split("_")[0] == cat][idx]
        # load mixed file
        file_mixed = os.path.join(dir_mixed, fn)
        M = pd.read_csv(file_mixed, header=None).to_numpy()
        # store spectra and titles
        spectra = []
        titles = []
        if plot_clean:
            # load clean file
            file_clean = os.path.join(dir_clean, fn)
            C = pd.read_csv(file_clean, header=None).to_numpy()
            # add to lists
            spectra.append(C)
            titles.append("'%s': Clean Sound (%s)" %
                          (fn, self._feature["feature"]))
        if plot_noise:
            # get filename of states file belonging to noise file
            fn_states = os.path.split([
                f for f in files_states if fn.replace(".csv", "") in f
            ][0])[-1]
            # get file idx, frame idx from states filename for noise file
            idx_file = int(fn_states.split("_")[-2])
            idx_frame = int(fn_states.split("_")[-1].split(".")[0])  # omit .csv
            # load noise file from idx_file
            file_noise = files_noise[idx_file]
            # get noise fragment from idx_frame
            N = pd.read_csv(file_noise,
                            header=None).to_numpy()[:, idx_frame:idx_frame +
                                                    M.shape[1]]
            # add deltas to noise fragment
            N = np.concatenate((N, librosa.feature.delta(N, mode="mirror")),
                               axis=0)
            # add to lists
            spectra.append(N)
            titles.append("MAV Noise (%s)" % self._feature["feature"])
        # add mixed file to end of lists
        spectra.append(M)
        titles.append("'%s': Noisy Mix (%s)" % (fn, self._feature["feature"]))
        for j, Z in enumerate(spectra):
            # plot spectrum (first half of the rows)
            ax = fig.add_subplot(gs[2 * j, i])
            ph.plot_spectrum(Z[:Z.shape[0] // 2], self._feature)
            ax.set_title(titles[j])
            # plot delta-spectrum (second half of the rows)
            ax = fig.add_subplot(gs[2 * j + 1, i])
            ph.plot_spectrum(Z[Z.shape[0] // 2:],
                             self._feature,
                             colormap="coolwarm")
def create_denoised_train_augmentation_set(self,
                                           noise_ratio,
                                           enp_model_index=0,
                                           overwrite=False):
    """Denoise the mixed spectra to obtain a single denoised training set.

    Keyword arguments:
    noise_ratio -- the ratio of the noise compared to the signal (required;
        must match a ratio used when mixing the noisy data),
    enp_model_index -- selects which ego-noise predictor (enp) to use for
        denoising, if multiple are available (default: 0),
    overwrite -- whether to overwrite existing data (default: False).
    """
    # set output directory
    dir_out = os.path.join(
        self._dir_root_ac,
        "Features",
        "Clean",
        "Spectra",
        "Train Denoised",
    )
    # check if it exists or should be overwritten
    if os.path.exists(dir_out) and not overwrite:
        return  # exit
    refresh_directory(dir_out)
    # get directory containing noise model
    dir_model_enp = sorted(
        os.listdir(os.path.join(self._dir_root_enp,
                                "Models")))[enp_model_index]
    # get context frames from the model directory name
    # TODO: read the context from the model config instead of the dir name
    model_context = int(dir_model_enp.split("_")[-1][-1])
    # load files containing predicted noise
    dir_predicted = os.path.join(
        self._dir_root_enp,
        "Models",
        dir_model_enp,
        "Output",
        "Train",
        "Predicted",
    )
    files_predicted = retrieve_files(dir_predicted)
    # load states files
    dir_states = os.path.join(self._dir_root_ac, "Features", "Mixed",
                              "States", "Train")
    files_states = retrieve_files(dir_states)
    # load mixed files
    dir_mix = os.path.join(
        self._dir_root_ac,
        "Features",
        "Mixed",
        "Spectra",
        "Ratio_%.2f" % noise_ratio,
        "Train",
    )
    files_mix = retrieve_files(dir_mix)
    # set up pool; max(1, ...) keeps Pool valid on single-core hosts
    part = partial(
        self._create_denoised_feature,
        files_predicted=files_predicted,
        files_states=files_states,
        context=model_context,
        ratio=noise_ratio,
        dir_out=dir_out,
    )
    with Pool(processes=max(1, os.cpu_count() - 1)) as pool:
        pool.map(part, files_mix)
    # add extension to filenames to match other augmentations
    for fn in sorted(os.listdir(dir_out)):
        fn_new = "%s-dn.csv" % fn.split(".")[0]
        os.rename(os.path.join(dir_out, fn), os.path.join(dir_out, fn_new))
def create_denoised_test_set(self,
                             noise_ratio,
                             enp_model_index=0,
                             overwrite=False):
    """Denoise the mixed spectra to obtain a denoised test set.

    Keyword arguments:
    noise_ratio -- the ratio of the noise compared to the signal (required),
    enp_model_index -- selects which ego-noise predictor (enp) to use for
        denoising, if multiple are available (default: 0),
    overwrite -- whether to overwrite existing data (default: False).

    Note: only noise ratios that were used for the mixing of noisy data
    (via create_noisy_dataset) can be used for denoising.
    """
    # set output directory
    dir_out = os.path.join(
        self._dir_root_ac,
        "Features",
        "Denoised",
        "Spectra",
        "Ratio_%.2f" % noise_ratio,
        "Test",
    )
    # check if it exists or should be overwritten
    if os.path.exists(dir_out) and not overwrite:
        return
    refresh_directory(dir_out)
    # get directory containing noise model
    dir_model_enp = sorted(
        os.listdir(os.path.join(self._dir_root_enp,
                                "Models")))[enp_model_index]
    # get context frames from the model directory name
    # TODO: read the context from the model config instead of the dir name
    model_context = int(dir_model_enp.split("_")[-1][-1])
    # load files containing predicted noise
    dir_predicted = os.path.join(
        self._dir_root_enp,
        "Models",
        dir_model_enp,
        "Output",
        "Test",
        "Predicted",
    )
    files_predicted = retrieve_files(dir_predicted)
    # load states files
    dir_states = os.path.join(self._dir_root_ac, "Features", "Mixed",
                              "States", "Test")
    files_states = retrieve_files(dir_states)
    # load mixed files
    dir_mix = os.path.join(
        self._dir_root_ac,
        "Features",
        "Mixed",
        "Spectra",
        "Ratio_%.2f" % noise_ratio,
        "Test",
    )
    files_mix = retrieve_files(dir_mix)
    # set up pool; max(1, ...) keeps Pool valid on single-core hosts
    part = partial(
        self._create_denoised_feature,
        files_predicted=files_predicted,
        files_states=files_states,
        context=model_context,
        ratio=noise_ratio,
        dir_out=dir_out,
    )
    with Pool(processes=max(1, os.cpu_count() - 1)) as pool:
        pool.map(part, files_mix)
def split_dataset(self,
                  train_test_ratio=0.8,
                  train_val_ratio=0.8,
                  overwrite=False):
    """Split the dataset into a training, validation and test subset.

    Keyword arguments:
    train_test_ratio -- ratio of the training set over the complete set,
        the remainder will be assigned to the test subset (default: 0.8),
    train_val_ratio -- ratio of the actual training set over the training
        set, the remainder will be assigned to the validation subset
        (default: 0.8),
    overwrite -- whether to overwrite existing data (default: False).
    """
    # input/output directories
    dir_input = os.path.join(self._dir_root, "Aircraft Classification",
                             "Audio", "Full")
    dir_root_output = os.path.join(self._dir_root, "Aircraft Classification",
                                   "Audio")
    # bail out early when a split already exists and may not be replaced
    if os.path.exists(os.path.join(dir_root_output,
                                   "Train")) and not overwrite:
        print_verbose(
            self.verbose,
            "Dataset already exists and should not be overwritten.")
        return
    # start from empty output directories
    subdirs = ["Train", "Val", "Test"]
    for subdir in subdirs:
        refresh_directory(os.path.join(dir_root_output, subdir))
    # read files into array for easy slicing
    files = np.array(retrieve_files(dir_input))
    # derive the per-category file count from the category names
    file_categories = np.array(
        [os.path.split(f)[-1].split("_")[0] for f in files])
    categories = np.unique(file_categories)
    files_per_category = len(files) // len(categories)
    # draw train/val/test indices once; the same split applies per category
    train_idcs, test_idcs = train_test_split(np.arange(files_per_category),
                                             train_size=train_test_ratio,
                                             random_state=42)
    train_idcs, val_idcs = train_test_split(train_idcs,
                                            train_size=train_val_ratio,
                                            random_state=42)
    print_verbose(
        self.verbose,
        "Split per category (Train, Val, Test): (%d, %d, %d)" %
        (len(train_idcs), len(val_idcs), len(test_idcs)),
    )
    # copy each file into the subdir its (1-based) trailing index maps to
    for idcs, subdir in zip([train_idcs, val_idcs, test_idcs], subdirs):
        for file in files:
            file_idx = int(
                os.path.split(file)[-1].split("_")[-1].split(".")[0]) - 1
            if file_idx not in idcs:
                continue
            dest = os.path.join(dir_root_output, subdir,
                                os.path.split(file)[-1])
            shutil.copyfile(file, dest)
    # remove the now redundant 'Full' input directory
    shutil.rmtree(dir_input)
def create_mixed_dataset(self,
                         noise_ratio,
                         augmentations=None,
                         seed=42,
                         overwrite=False):
    """Mix clean spectra with ego-noise spectra at the specified ratio.

    Keyword arguments:
    noise_ratio -- the ratio of the noise compared to the signal,
    augmentations -- iterable of augmentation names; when given, mix the
        default subsets plus the matching 'Train <augmentation>' sets
        (default: None, i.e. mix all sets found in the audio root),
    seed -- base seed used to derive reproducible per-file seeds
        (default: 42),
    overwrite -- whether to overwrite existing data (default: False).

    Note that the mixing is entirely random. When generating new features
    (i.e. deciding to extract both 'Mel' and 'Stft' features instead of
    only 'Stft'), it is recommended to set overwrite to True to ensure
    mixing consistency between existing and new features.
    """
    # get list of dataset directory names
    if augmentations is not None:
        subsets = ["Train", "Val", "Test"]
        subsets += ["Train " + aug for aug in augmentations]
    else:
        # extract all sets available
        subsets = os.listdir(self._dir_root_audio)
    # loop through sets in dataset
    for subset in subsets:
        # retrieve 'clean' spectra (ESC-50)
        dir_clean = os.path.join(self._dir_root_ac, "Features", "Clean",
                                 "Spectra", subset)
        files_clean = retrieve_files(dir_clean)
        # retrieve ego-noise spectra (MAV)
        subset_enp = subset.split(" ")[0]  # use 'Train' set for augmentations
        dir_noise = os.path.join(self._dir_root_enp, "Dataset", subset_enp,
                                 "Spectra")
        files_noise = retrieve_files(dir_noise)
        # retrieve states belonging to ego-noise
        dir_states = os.path.join(self._dir_root_enp, "Dataset", subset_enp,
                                  "States")
        files_states = retrieve_files(dir_states)
        # generate a list of directory-specific 'seeds' from the given seed
        # to preserve reproducible randomness while multiprocessing
        dir_seed = seed + len(subset)
        np.random.seed(dir_seed)
        seeds = np.random.randint(0, 10 * len(files_clean), len(files_clean))
        # make state dir. if it does not exist (do not overwrite, ever)
        dir_out = os.path.join(self._dir_root_ac, "Features", "Mixed",
                               "States", subset)
        if not os.path.exists(dir_out):
            os.makedirs(dir_out)
        # set output directory (spectra)
        dir_out = os.path.join(
            self._dir_root_ac,
            "Features",
            "Mixed",
            "Spectra",
            "Ratio_%.2f" % noise_ratio,
            subset,
        )
        # check if it exists or should be overwritten
        if os.path.exists(dir_out) and not overwrite:
            continue  # skip set
        refresh_directory(dir_out)
        # set up multiprocessing to mix audio
        part = partial(
            self._create_mixed_spectrum,
            files_noise=files_noise,
            files_states=files_states,
            ratio=noise_ratio,
            subset=subset,
        )
        # max(1, ...): Pool(processes=0) raises ValueError on single-core hosts
        with Pool(processes=max(1, os.cpu_count() - 1)) as pool:
            pool.starmap(part, list(zip(files_clean, seeds)))
def extract_states(self):
    """Pre-process the raw state logs of every subset and export them to the
    unsynchronized states directory.

    For each file: adds delta columns for the rpm and command channels,
    converts the NED position/velocity/acceleration into height, horizontal
    magnitude and (negated) vertical components, and re-orders the columns.
    """
    # Loop over subsets
    for subset in ["Train", "Val", "Test"]:
        # Get states files
        dir_states = os.path.join(self._dir_root, "Ego-Noise Prediction",
                                  "Dataset", subset, "States")
        files_states = retrieve_files(dir_states)
        # Directory for the unsynchronized states
        dir_output = os.path.join(self._dir_root_set, "Unsynchronized",
                                  subset, "States")
        refresh_directory(dir_output)
        # Loop through files in set
        for f in files_states:
            # xyz in NED frame
            # Read in as dataframe
            df = pd.read_csv(f, header=0)
            # Add delta-rpm and delta-cmd columns (one np.diff per channel,
            # previously eight copy-pasted statements)
            for col in [
                    "rpm_1", "rpm_2", "rpm_3", "rpm_4",
                    "cmd_thrust", "cmd_roll", "cmd_pitch", "cmd_yaw",
            ]:
                df[col + "_delta"] = np.diff(df[col].to_numpy(), prepend=0)
            # Prune horizontal position
            df.drop(columns=["pos_x", "pos_y"], inplace=True)
            # Negate vertical position to get height
            df.rename(columns={"pos_z": "height"}, inplace=True)
            df["height"] *= -1
            # Replace north- and east velocities with magnitude (horizontal)
            df["vel_hor"] = np.sqrt(df["vel_x"]**2 + df["vel_y"]**2)
            df.drop(columns=["vel_x", "vel_y"], inplace=True)
            # Negate downwards velocity to get vertical velocity
            df.rename(columns={"vel_z": "vel_ver"}, inplace=True)
            df["vel_ver"] *= -1
            # Replace north- and east accelerations with magnitude (horizontal)
            df["acc_hor"] = np.sqrt(df["acc_x"]**2 + df["acc_y"]**2)
            df.drop(columns=["acc_x", "acc_y"], inplace=True)
            # Negate downwards acceleration to get vertical acceleration
            df.rename(columns={"acc_z": "acc_ver"}, inplace=True)
            df["acc_ver"] *= -1
            # Re-order the frame
            cols = [
                "delta_t",
                "rpm_1", "rpm_2", "rpm_3", "rpm_4",
                "rpm_1_delta", "rpm_2_delta", "rpm_3_delta", "rpm_4_delta",
                "cmd_thrust", "cmd_roll", "cmd_pitch", "cmd_yaw",
                "cmd_thrust_delta", "cmd_roll_delta", "cmd_pitch_delta",
                "cmd_yaw_delta",
                "height", "vel_hor", "vel_ver", "acc_hor", "acc_ver",
                "angle_phi", "angle_theta", "angle_psi",
                "rate_p", "rate_q", "rate_r",
            ]
            df = df[cols]
            # Export
            fn = os.path.split(f)[-1]
            df.to_csv(os.path.join(dir_output, fn), header=True, index=False)
def split_features(
    self, subset=None, augmentations=None, noise_set=None, noise_ratio=None
):
    """Split the 5-second feature files of the selected subsets into the
    final dataset directories, keeping only the configured categories.

    Keyword arguments:
    subset -- a single subset name or iterable of names to split
        (default: None, i.e. derive the subsets automatically),
    augmentations -- augmentation name(s) added to the default subsets
        when no explicit subset is given (default: None),
    noise_set -- which feature variant to split, e.g. 'Clean' or 'Mixed'
        (default: None, treated as 'Clean'),
    noise_ratio -- noise ratio identifying the feature directory; required
        whenever noise_set is not 'Clean' (default: None).
    """
    # default 'noise' is no noise (clean)
    if noise_set is None:
        noise_set = "Clean"
    if subset is not None:
        # accept a single name as well as an iterable of names
        # (isinstance instead of type == str: the idiomatic type check)
        if isinstance(subset, str):
            subset = [subset]
        subsets = subset
    else:
        # split 'Train', 'Val', 'Test' set if no specific subset is given
        if augmentations is not None:
            if isinstance(augmentations, str):
                augmentations = [augmentations]
            # add specific augmentation(s) to default sets
            subsets = ["Train", "Val", "Test"]
            subsets += ["Train " + a for a in augmentations]
        else:
            # use all available augmentations
            if noise_set == "Clean":
                subsets = os.listdir(
                    os.path.join(self._dir_root_set, "Features", "Clean",
                                 "Spectra")
                )
            else:
                subsets = os.listdir(
                    os.path.join(
                        self._dir_root_set,
                        "Features",
                        noise_set,
                        "Spectra",
                        "Ratio_%.2f" % noise_ratio,
                    )
                )
    # root input directory spectra
    dir_root_spectra_in = os.path.join(
        self._dir_root_set, "Features", noise_set, "Spectra"
    )
    if noise_set != "Clean":
        dir_root_spectra_in = os.path.join(
            dir_root_spectra_in, "Ratio_%.2f" % noise_ratio
        )
    for subset in sorted(subsets):
        # load 5-second spectra belonging to categories
        dir_in_spectra = os.path.join(dir_root_spectra_in, subset)
        files_spectra = retrieve_files(dir_in_spectra)
        files_spectra = [
            f
            for f in files_spectra
            if os.path.split(f)[-1].split("_")[0]
            in self._classification["categories"]
        ]
        # set output directory (augmentations i.e. 'Train Denoised' go in
        # 'Train')
        dir_out_spectra = os.path.join(
            self._dir_root_set, "Dataset", subset.split(" ")[0], "Spectra"
        )
        # refresh directories only for non-augmented sets
        if subset in ["Train", "Val", "Test"]:
            refresh_directory(dir_out_spectra)
        # split spectra; max(1, ...) keeps Pool valid on single-core hosts
        part = partial(self._split_spectra, dir_output=dir_out_spectra)
        with Pool(processes=max(1, os.cpu_count() - 1)) as pool:
            pool.map(part, files_spectra)
        # split the states in case of implicit denoising
        if hasattr(self, "_states"):
            # load states
            dir_in_states = os.path.join(
                self._dir_root_set, "Features", "Mixed", "States", subset
            )
            files_states = retrieve_files(dir_in_states)
            files_states = [
                f
                for f in files_states
                if os.path.split(f)[-1].split("_")[0]
                in self._classification["categories"]
            ]
            # refresh output directory only for non-augmented sets
            dir_out_states = os.path.join(
                self._dir_root_set, "Dataset", subset.split(" ")[0], "States"
            )
            if subset in ["Train", "Val", "Test"]:
                refresh_directory(dir_out_states)
            # split states
            part = partial(self._split_states, dir_output=dir_out_states)
            with Pool(processes=max(1, os.cpu_count() - 1)) as pool:
                pool.map(part, files_states)
        print_verbose(
            self.verbose,
            "Split %d files (%d categories) into %d files" % (
                len(files_spectra),
                len(self._classification["categories"]),
                len(os.listdir(dir_out_spectra)),
            ),
        )