Example No. 1
    def load_datasets(self):

        datasets = []
        dataframes = []
        for subset in ["Train", "Val", "Test"]:
            # load spectra
            dir_spectra = os.path.join(self._dir_root_set, "Dataset", subset, "Spectra")
            files_spectra = retrieve_files(dir_spectra)

            if hasattr(self, "_states"):
                # load states
                dir_states = os.path.join(
                    self._dir_root_set, "Dataset", subset, "States"
                )
                files_states = retrieve_files(dir_states)

            # load data into torch set, dump info into df
            if not hasattr(self, "_states"):
                dataset, df = self._load_dataset(files_spectra)
            else:
                dataset, df = self._load_dataset(files_spectra, files_states)
            datasets.append(dataset)
            df.insert(0, "Set", subset)
            dataframes.append(df)

        self._train_set, self._val_set, self._test_set = datasets
        dataframe = pd.concat(dataframes)

        return dataframe
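
A minimal usage sketch (the object and constructor names here are assumptions for illustration, not part of the example above):

    # build the torch datasets and inspect the per-sample metadata
    classifier = AircraftClassifier(dir_root="Data")  # hypothetical class
    df = classifier.load_datasets()
    print(df.groupby("Set").size())  # sample counts per Train/Val/Test split
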
    def plot_features(self, subset, states=None, idx=0):

        # plot all states by default
        if states is None:
            states = [
                "rpm",
                "rpm_delta",
                "cmd",
                "cmd_delta",
                "height",
                "vel",
                "acc",
                "angles",
                "rates",
            ]

        # set up figure
        n_rows = len(states) + 1
        fig = plt.figure(figsize=(8, 3 * n_rows), constrained_layout=True)
        gs = fig.add_gridspec(n_rows, 1)

        # load spectrum
        dir_spectrum = os.path.join(self._dir_root_set, "Dataset", subset,
                                    "Spectra")
        files_spectrum = retrieve_files(dir_spectrum)
        Z = pd.read_csv(files_spectrum[idx], header=None).to_numpy()
        # set plot title
        if self._feature["feature"] == "Mfcc":
            title = "MFCC (%d bins)" % self._feature["mfc_coefficients"]
        elif self._feature["feature"] == "Stft":
            title = "Spectrogram (%d frequency bins)" % self._feature[
                "frequency_bins"]
        else:
            title = "%s-spectrogram (%d frequency bins)" % (
                self._feature["feature"],
                self._feature["frequency_bins"],
            )
        # plot spectrum
        ax = fig.add_subplot(gs[0])
        ax.set_title(title)
        ph.plot_spectrum(Z, self._feature)

        # load states
        dir_states = os.path.join(self._dir_root_set, "Dataset", subset,
                                  "States")
        files_states = retrieve_files(dir_states)
        S = pd.read_csv(files_states[idx], header=None).to_numpy()
        # plot relevant states
        colors = ["orangered", "darkolivegreen", "steelblue", "goldenrod"]
        for i, state_name in enumerate(states):
            ax = fig.add_subplot(gs[1 + i])
            ph.plot_states_synchronized(S, state_name, self._feature, colors)

        plt.show()
        return fig
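
Hypothetical usage, assuming the method lives on the same kind of pipeline object as above:

    # plot the spectrum of the first validation file plus two state groups
    fig = predictor.plot_features("Val", states=["rpm", "cmd"], idx=0)
    fig.savefig("features_val_0.png")
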
    def create_mixed_test_set(self, noise_ratio, seed=42, overwrite=False):
        # make state dir. if it does not exist (do not overwrite, ever)
        dir_out = os.path.join(self._dir_root_ac, "Features", "Mixed",
                               "States", "Test")
        if not os.path.exists(dir_out):
            os.makedirs(dir_out)

        # set output directory (spectra)
        dir_out = os.path.join(
            self._dir_root_ac,
            "Features",
            "Mixed",
            "Spectra",
            "Ratio_%.2f" % noise_ratio,
            "Test",
        )
        # check if it exists or should be overwritten
        if os.path.exists(dir_out) and not overwrite:
            return
        refresh_directory(dir_out)

        # retrieve 'clean' spectra (ESC-50)
        dir_clean = os.path.join(self._dir_root_ac, "Features", "Clean",
                                 "Spectra", "Test")
        files_clean = retrieve_files(dir_clean)
        # retrieve ego-noise spectra (MAV)
        dir_noise = os.path.join(self._dir_root_enp, "Dataset", "Test",
                                 "Spectra")
        files_noise = retrieve_files(dir_noise)
        # retrieve states belonging to ego-noise
        dir_states = os.path.join(self._dir_root_enp, "Dataset", "Test",
                                  "States")
        files_states = retrieve_files(dir_states)

        # generate a list of directory-specific 'seeds' from the given seed
        # to preserve reproducible randomness while multiprocessing
        dir_seed = seed + len("Test")
        np.random.seed(dir_seed)
        seeds = np.random.randint(0, 10 * len(files_clean), len(files_clean))

        # set up multiprocessing to mix audio
        part = partial(
            self._create_mixed_spectrum,
            files_noise=files_noise,
            files_states=files_states,
            ratio=noise_ratio,
            subset="Test",
        )
        with Pool(processes=max(1, os.cpu_count() - 1)) as pool:
            pool.starmap(part, list(zip(files_clean, seeds)))
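
The per-file seeds are what keep the mixing reproducible under multiprocessing: the master seed deterministically generates one seed per task, so results do not depend on worker scheduling. A standalone sketch of the same pattern (the worker function here is illustrative only):

    import numpy as np
    from multiprocessing import Pool

    def _mix(item, seed):
        rng = np.random.default_rng(seed)  # per-task generator, no shared global state
        return item + rng.integers(0, 10)

    if __name__ == "__main__":  # required on platforms that spawn workers
        items = list(range(8))
        np.random.seed(42)
        seeds = np.random.randint(0, 10 * len(items), len(items))  # one seed per task
        with Pool(2) as pool:
            results = pool.starmap(_mix, zip(items, seeds))
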
    def create_clean_dataset(self, augmentations=None, overwrite=False):
        """Create (extract and export) the clean dataset from the audio data.

        Keyword argument:
            overwrite -- whether to overwrite existing data (default: False).
        """
        # get list of dataset directory names
        if augmentations is not None:
            subsets = ["Train", "Val", "Test"]
            subsets += ["Train " + aug for aug in augmentations]
        else:
            # extract all sets available
            subsets = sorted(os.listdir(self._dir_root_audio))

        for subset in subsets:
            # output directory (spectra)
            dir_output = os.path.join(self._dir_root_ac, "Features", "Clean",
                                      "Spectra", subset)
            # only extract feature if set does not exist or should be overwritten
            if os.path.exists(dir_output) and not overwrite:
                continue
            refresh_directory(dir_output)

            # get audio files (ESC-50)
            dir_input = os.path.join(self._dir_root_audio, subset)
            files_input = retrieve_files(dir_input)

            # multiprocessing
            part = partial(self._export_spectrum, dir_out=dir_output)
            with Pool(processes=max(1, os.cpu_count() - 1)) as pool:
                pool.map(part, files_input)
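
Hypothetical usage (the augmentation name is an assumption; any directory of the form 'Train <augmentation>' under the audio root would work):

    # extract clean spectra for the base subsets plus one augmented train set
    classifier.create_clean_dataset(augmentations=["Denoised"], overwrite=True)
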
    def extract_spectra(self, offset=50, scaling=80):
        # Loop over subsets
        for subset in ["Train", "Val", "Test"]:
            # Get audio files
            dir_audio = os.path.join(self._dir_root, "Ego-Noise Prediction",
                                     "Dataset", subset, "Audio")
            files_audio = retrieve_files(dir_audio)
            # directory for the unsynchronized spectra
            dir_output = os.path.join(self._dir_root_set, "Unsynchronized",
                                      subset, "Spectra")
            # Refresh directory
            refresh_directory(dir_output)

            # Loop through files in set
            for f in files_audio:
                # Extract spectrum
                Z = fh.extract_spectrum(f, self._feature)
                # Scale spectrum
                Z += offset
                Z /= scaling
                # Save to appropriate directory
                fn = os.path.split(f)[-1].replace(".wav", ".csv")
                fp = os.path.join(dir_output, fn)
                pd.DataFrame(Z).to_csv(fp, index=False, header=False)

                print_verbose(
                    self.super_verbose,
                    "Finished extracting feature for '%s' set." % subset,
                )
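
The offset/scaling step above is a simple affine normalization of the dB-scaled spectrum; it is exactly invertible, which a quick check confirms:

    import numpy as np

    offset, scaling = 50, 80
    Z_db = np.array([[-50.0, 0.0, 30.0]])      # example dB values
    Z_scaled = (Z_db + offset) / scaling       # forward transform, as above
    assert np.allclose(Z_scaled * scaling - offset, Z_db)  # exact inverse
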
Example No. 6
    def classify_mismatched_test_set(self, model):
        # load spectra
        dir_spectra = os.path.join(self._dir_root_set, "Dataset", "Test", "Spectra")
        files_spectra = retrieve_files(dir_spectra)

        # load data into torch set, dump info into df
        if not hasattr(self, "_states"):
            dataset, df = self._load_dataset(files_spectra)
        else:
            # load states
            dir_states = os.path.join(self._dir_root_set, "Dataset", "Test", "States")
            files_states = retrieve_files(dir_states)
            dataset, df = self._load_dataset(files_spectra, files_states)

        # predict data
        df = self._classify_set(model, dataset, df)

        return df
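
A hedged usage sketch; the column names on the returned dataframe are assumptions, since _classify_set is not shown here:

    # classify the mismatched test set with a trained model
    df = classifier.classify_mismatched_test_set(model)
    # e.g. compute accuracy, assuming 'Label' and 'Prediction' columns exist
    accuracy = (df["Prediction"] == df["Label"]).mean()
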
Example No. 7
    def save_network_output(self, model, dir_model, subset, plot=True):

        # refresh the output directories
        output_subdirs = ["Original", "Predicted", "Residual"]
        for subdir in output_subdirs:
            refresh_directory(os.path.join(dir_model, "Output", subset,
                                           subdir))

        # load the original files (states, spectra) in the subset
        dir_states = os.path.join(self._dir_root_set, "Dataset", subset,
                                  "States")
        files_states = retrieve_files(dir_states)
        dir_spectra = os.path.join(self._dir_root_set, "Dataset", subset,
                                   "Spectra")
        files_spectra = retrieve_files(dir_spectra)

        for file_states, file_spectra in zip(files_states, files_spectra):
            # load original spectrum and cut off the context frames
            original = pd.read_csv(file_spectra, header=None).to_numpy()
            if self._states["context_frames"] > 0:
                original = original[:, self._states["context_frames"]:]
            # predict spectrum from states file
            predicted = self._predict(model, file_states, original.shape)
            # compute residual
            residual = original - predicted

            # plot if desired
            if plot:
                self._plot_model_output(original, predicted, residual)

            # save output
            fn = os.path.split(file_states)[-1]  # target filename
            output_spectra = [original, predicted, residual]
            for spectrum, subdir in zip(output_spectra, output_subdirs):
                # save spectrum
                dir_out = os.path.join(dir_model, "Output", subset, subdir)
                pd.DataFrame(spectrum).to_csv(os.path.join(dir_out, fn),
                                              index=False,
                                              header=False)
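
Hypothetical usage (object and directory names assumed): export the original, predicted and residual spectra for the validation set without plotting each file:

    predictor.save_network_output(model, dir_model, "Val", plot=False)
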
Example No. 8
    def _load_data(self, dir_split):

        # load N files
        files_X = retrieve_files(os.path.join(dir_split, "States"))  # input
        files_Y = retrieve_files(os.path.join(dir_split, "Spectra"))  # output

        # load states: NxTxS
        data_X = [
            pd.read_csv(f, header=None).to_numpy().transpose() for f in files_X
        ]
        # extract only relevant states
        data_X = [
            fh.extract_relevant_states(data, self._states["states"])
            for data in data_X
        ]
        # load spectra: NxTxF
        data_Y = [
            pd.read_csv(f, header=None).to_numpy().transpose() for f in files_Y
        ]

        if self._states["context_frames"] > 0:
            # add context to the dataset: (NxTxS, NxTxF) -> (Nx(T-C)xCxS, Nx(T-C)xF)
            data_X, data_Y = list(
                zip(*[
                    self._add_context(dX, dY)
                    for dX, dY in zip(data_X, data_Y)
                ]))
        else:
            # add placeholder dim. for X: NxTxS -> NxTx1xS
            data_X = [np.expand_dims(X, 1) for X in data_X]

        # concatenate N and T axes to get 3D set
        data_X = np.concatenate(data_X, axis=0)
        data_Y = np.concatenate(data_Y, axis=0)
        # convert to torch dataset
        X = torch.from_numpy(data_X).float()
        Y = torch.from_numpy(data_Y).float()
        dataset = torch.utils.data.TensorDataset(X, Y)
        return dataset
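
_add_context is not shown in this example; below is a minimal sketch of one plausible implementation under the TxS / TxF shape convention used above (an assumption about its behavior, not its actual source):

    import numpy as np

    def add_context(X, Y, context):
        # stack the 'context' input frames preceding each output frame:
        # TxS -> (T-C)xCxS for the states, TxF -> (T-C)xF for the spectra
        windows = np.stack(
            [X[t - context:t] for t in range(context, X.shape[0])])
        return windows, Y[context:]
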
    def synchronize_data(self, skip_takeoff=True):

        # Loop over subsets
        for subset in ["Train", "Val", "Test"]:
            # list unsynchronized spectra
            dir_spectra = os.path.join(self._dir_root_set, "Unsynchronized",
                                       subset, "Spectra")
            files_spectra = retrieve_files(dir_spectra)
            # list unsynchronized states
            dir_states = os.path.join(self._dir_root_set, "Unsynchronized",
                                      subset, "States")
            files_states = retrieve_files(dir_states)

            # set the root output directory and refresh the output directories
            dir_root_output = os.path.join(self._dir_root_set, "Dataset",
                                           subset)
            refresh_directory(os.path.join(dir_root_output, "Spectra"))
            refresh_directory(os.path.join(dir_root_output, "States"))

            # synchronize each pair of files
            for file_spectra, file_states in zip(files_spectra, files_states):
                self._synchronize_pair(file_spectra, file_states,
                                       dir_root_output, skip_takeoff)
    def plot_spectra_denoised(
        self,
        noise_ratio,
        categories=None,
        idx=0,
        plot_clean=True,
        plot_noise=True,
        plot_mixed=True,
        plot_predicted=True,
        enp_model_index=0,
    ):
        """Plot a selection of spectra of a denoised test set.

        Keyword arguments:
            set_name -- set to be plotted (e.g. 'Test'),
            noise_ratio -- the ratio of the noise compared to the signal,
            categories -- iterable containing the original categories
            (airplane, engine, etc.) to be plotted (default: all),
            features -- iterable containing the features to be plotted
            (default: all),
            idx -- index of the feature selected for plotting (default: 0).
            plot_clean -- whether to plot the clean spectra (default: True),
            plot_noise -- whether to plot the noise-only spectra
            (default: True)
            plot_mixed -- whether to plot the mixed-only spectra
            (default: True)
            plot_predicted: whether to plot the predicted ego-noise spectra
            (default: True)
            enp_model_index -- selects which ego-noise predictor (enp) to use
            for the prediction, if multiple are available (default: 0),
            colorbar -- whether to plot a colorbar next to each plotted
            spectrum (default: False)
        Example usage:
            plot_spectra_denoised(1.0, ['airplane', 'helicopter'], ['Stft'],
                                  plot_noise=False, plot_predicted=False)
            This plots the clean, mixed and denoised spectra of the first
            spectrogram belonging to the airplane and helicopter categories
            within the test set with a noise ratio of 1.00.
        """
        # define directories
        dir_denoised = os.path.join(
            self._dir_root_ac,
            "Features",
            "Denoised",
            "Spectra",
            "Ratio_%.2f" % noise_ratio,
            "Test",
        )
        if plot_clean:
            dir_clean = os.path.join(
                self._dir_root_ac,
                "Features",
                "Clean",
                "Spectra",
                "Test",
            )
        if plot_noise:
            # load noise files
            dir_noise = os.path.join(
                self._dir_root_enp,
                "Dataset",
                "Test",
                "Spectra",
            )
            files_noise = retrieve_files(dir_noise)
        if plot_mixed:
            dir_mixed = os.path.join(
                self._dir_root_ac,
                "Features",
                "Mixed",
                "Spectra",
                "Ratio_%.2f" % noise_ratio,
                "Test",
            )
        if plot_predicted:
            # select appropriate model
            dir_model_enp = sorted(
                os.listdir(os.path.join(self._dir_root_enp,
                                        "Models")))[enp_model_index]
            dir_predicted = os.path.join(
                self._dir_root_enp,
                "Models",
                dir_model_enp,
                "Output",
                "Test",
                "Predicted",
            )
            files_predicted = retrieve_files(dir_predicted)
            # get the model context (encoded in the model directory name) to
            # offset the predicted frames
            context = int(dir_model_enp.split("_")[-1][-1])
        if plot_noise or plot_predicted:
            # also load states files to recover noise indices
            dir_states = os.path.join(
                self._dir_root_ac,
                "Features",
                "Mixed",
                "States",
                "Test",
            )
            files_states = retrieve_files(dir_states)

        # list available filenames via dir_denoised, which always exists
        # (dir_mixed is undefined when plot_mixed is False)
        filenames = sorted(os.listdir(dir_denoised))
        # plot all categories if not given
        if categories is None:
            file_categories = [f.split("_")[0] for f in filenames]
            categories = sorted(list(set(file_categories)))

        # setup figure, subfigures
        n_categories = len(categories)
        n_variants = (1 + int(plot_clean) + int(plot_noise) + int(plot_mixed) +
                      int(plot_predicted))
        fig = plt.figure(
            figsize=(6 * n_categories, 4 * n_variants),
            constrained_layout=True,
        )
        gs = fig.add_gridspec(n_variants * 2, n_categories)

        # loop through categories
        for i, cat in enumerate(categories):
            # get filename
            fn = [f for f in filenames if f.split("_")[0] == cat][idx]
            # load denoised file
            file_denoised = os.path.join(dir_denoised, fn)
            D = pd.read_csv(file_denoised, header=None).to_numpy()

            # store spectra and titles
            spectra = []
            titles = []

            if plot_clean:
                # load clean file
                file_clean = os.path.join(dir_clean, fn)
                C = pd.read_csv(file_clean, header=None).to_numpy()
                # add to lists
                spectra.append(C)
                titles.append("'%s': Clean Sound (%s)" %
                              (fn, self._feature["feature"]))

            if plot_noise or plot_predicted:
                # get filename of states file belonging to noise file
                fn_states = os.path.split([
                    f for f in files_states if fn.replace(".csv", "") in f
                ][0])[-1]
                # get file idx, frame idx from states filename for noise file
                idx_file = int(fn_states.split("_")[-2])
                idx_frame = int(fn_states.split("_")[-1].split(".")[0])

            if plot_noise:
                # load noise file from idx_file
                file_noise = files_noise[idx_file]
                # get noise fragment starting at idx_frame
                N = pd.read_csv(file_noise, header=None).to_numpy()
                N = N[:, idx_frame:idx_frame + D.shape[1]]
                # add deltas to noise fragment
                N = np.concatenate(
                    (N, librosa.feature.delta(N, mode="mirror")), axis=0)
                # add to lists
                spectra.append(N)
                titles.append("MAV Noise (%s)" % self._feature["feature"])

            if plot_mixed:
                # load mixed file
                file_mixed = os.path.join(dir_mixed, fn)
                M = pd.read_csv(file_mixed, header=None).to_numpy()
                # add to lists
                spectra.append(M)
                titles.append("'%s': Noisy Mix (%s)" %
                              (fn, self._feature["feature"]))

            if plot_predicted:
                # load predicted file and align it using the model context
                P = pd.read_csv(files_predicted[idx_file],
                                header=None).to_numpy()
                start = idx_frame - context
                P = P[:, start:start + D.shape[1]]
                # add deltas to predicted fragment
                P = np.concatenate(
                    (P, librosa.feature.delta(P, mode="mirror")), axis=0)
                spectra.append(P)
                titles.append("Predicted MAV Noise (%s)" %
                              self._feature["feature"])

            # add denoised file to end of lists
            spectra.append(D)
            titles.append("'%s': Denoised Sound (%s)" %
                          (fn, self._feature["feature"]))

            for j, Z in enumerate(spectra):
                # plot spectrum
                ax = fig.add_subplot(gs[2 * j, i])
                ph.plot_spectrum(Z[:Z.shape[0] // 2], self._feature)
                ax.set_title(titles[j])
                # plot delta-spectrum
                ax = fig.add_subplot(gs[2 * j + 1, i])
                ph.plot_spectrum(Z[Z.shape[0] // 2:],
                                 self._feature,
                                 colormap="coolwarm")
    def plot_spectra_mixed(
        self,
        set_name,
        noise_ratio,
        categories=None,
        idx=0,
        plot_clean=True,
        plot_noise=True,
    ):
        """Plot a selection of spectra of a mixed dataset.

        Keyword arguments:
            set_name -- set to be plotted (e.g. 'Test'),
            noise_ratio -- the ratio of the noise compared to the signal,
            categories -- iterable containing the original categories
            (airplane, engine, etc.) to be plotted (default: all),
            features -- iterable containing the features to be plotted
            (default: all),
            idx -- index of the feature selected for plotting (default: 0).
            plot_clean -- whether to plot the clean spectra (default: True),
            plot_noise -- whether to plot the noise-only spectra
            (default: True)
            colorbar -- whether to plot a colorbar next to each plotted
            spectrum (default: False)
        Example usage:
            plot_spectra_mixed('Train', )...

        """
        # define directories
        dir_mixed = os.path.join(
            self._dir_root_ac,
            "Features",
            "Mixed",
            "Spectra",
            "Ratio_%.2f" % noise_ratio,
            set_name,
        )
        if plot_clean:
            dir_clean = os.path.join(self._dir_root_ac, "Features", "Clean",
                                     "Spectra", set_name)
        if plot_noise:
            # load noise files
            dir_noise = os.path.join(self._dir_root_enp, "Dataset",
                                     set_name.split(" ")[0], "Spectra")
            files_noise = retrieve_files(dir_noise)
            # also load states files to recover noise indices
            dir_states = os.path.join(self._dir_root_ac, "Features", "Mixed",
                                      "States", set_name)
            files_states = retrieve_files(dir_states)

        # plot all categories if not given
        filenames = sorted(os.listdir(dir_mixed))
        if categories is None:
            file_categories = [f.split("_")[0] for f in filenames]
            categories = sorted(list(set(file_categories)))

        # setup figure, subfigures
        n_categories = len(categories)
        n_variants = 1 + int(plot_clean) + int(plot_noise)
        fig = plt.figure(
            figsize=(6 * n_categories, 4 * n_variants),
            constrained_layout=False,
        )
        gs = fig.add_gridspec(n_variants * 2, n_categories)

        # loop through categories
        for i, cat in enumerate(categories):
            # get filename
            fn = [f for f in filenames if f.split("_")[0] == cat][idx]
            # load mixed file
            file_mixed = os.path.join(dir_mixed, fn)
            M = pd.read_csv(file_mixed, header=None).to_numpy()

            # store spectra and titles
            spectra = []
            titles = []

            if plot_clean:
                # load clean file
                file_clean = os.path.join(dir_clean, fn)
                C = pd.read_csv(file_clean, header=None).to_numpy()
                # add to lists
                spectra.append(C)
                titles.append("'%s': Clean Sound (%s)" %
                              (fn, self._feature["feature"]))

            if plot_noise:
                # get filename of states file belonging to noise file
                fn_states = os.path.split([
                    f for f in files_states if fn.replace(".csv", "") in f
                ][0])[-1]
                # get file idx, frame idx from states filename for noise file
                idx_file = int(fn_states.split("_")[-2])
                idx_frame = int(
                    fn_states.split("_")[-1].split(".")[0])  # omit .csv
                # load noise file from idx_file
                file_noise = files_noise[idx_file]
                # get noise fragment starting at idx_frame
                N = pd.read_csv(file_noise, header=None).to_numpy()
                N = N[:, idx_frame:idx_frame + M.shape[1]]
                # add deltas to noise fragment
                N = np.concatenate(
                    (N, librosa.feature.delta(N, mode="mirror")), axis=0)
                # add to lists
                spectra.append(N)
                titles.append("MAV Noise (%s)" % self._feature["feature"])

            # add mixed file to end of lists
            spectra.append(M)
            titles.append("'%s': Noisy Mix (%s)" %
                          (fn, self._feature["feature"]))

            for j, Z in enumerate(spectra):
                # plot spectrum
                ax = fig.add_subplot(gs[2 * j, i])
                ph.plot_spectrum(Z[:Z.shape[0] // 2], self._feature)
                ax.set_title(titles[j])
                # plot delta-spectrum
                ax = fig.add_subplot(gs[2 * j + 1, i])
                ph.plot_spectrum(Z[Z.shape[0] // 2:],
                                 self._feature,
                                 colormap="coolwarm")
    def create_denoised_train_augmentation_set(self,
                                               noise_ratio,
                                               enp_model_index=0,
                                               overwrite=False):
        """Denoise the mixed spectra to obtain a single denoised training set.

        Keyword arguments:
            noise_ratio -- the ratio of the noise compared to the signal
            (default: 1.0),
            enp_model_index -- selects which ego-noise predictor (enp) to use
            for denoising, if multiple are available (default: 0),
            overwrite -- whether to overwrite existing data (default: False).
        """
        # set output directory
        dir_out = os.path.join(
            self._dir_root_ac,
            "Features",
            "Clean",
            "Spectra",
            "Train Denoised",
        )
        # check if it exists or should be overwritten
        if os.path.exists(dir_out) and not overwrite:
            return  # exit
        refresh_directory(dir_out)

        # get directory containing noise model
        dir_model_enp = sorted(
            os.listdir(os.path.join(self._dir_root_enp,
                                    "Models")))[enp_model_index]
        # get context frames (TODO: read this from the model config instead)
        model_context = int(dir_model_enp.split("_")[-1][-1])
        # load files containing predicted noise
        dir_predicted = os.path.join(
            self._dir_root_enp,
            "Models",
            dir_model_enp,
            "Output",
            "Train",
            "Predicted",
        )
        files_predicted = retrieve_files(dir_predicted)

        # load states files
        dir_states = os.path.join(self._dir_root_ac, "Features", "Mixed",
                                  "States", "Train")
        files_states = retrieve_files(dir_states)

        # load mixed files
        dir_mix = os.path.join(
            self._dir_root_ac,
            "Features",
            "Mixed",
            "Spectra",
            "Ratio_%.2f" % noise_ratio,
            "Train",
        )
        files_mix = retrieve_files(dir_mix)

        # set up pool
        part = partial(
            self._create_denoised_feature,
            files_predicted=files_predicted,
            files_states=files_states,
            context=model_context,
            ratio=noise_ratio,
            dir_out=dir_out,
        )
        with Pool(processes=max(1, os.cpu_count() - 1)) as pool:
            pool.map(part, files_mix)

        # add extension to filenames to match other augmentations
        for fn in sorted(os.listdir(dir_out)):
            fn_new = "%s-dn.csv" % fn.split(".")[0]
            os.rename(os.path.join(dir_out, fn), os.path.join(dir_out, fn_new))
    def create_denoised_test_set(self,
                                 noise_ratio,
                                 enp_model_index=0,
                                 overwrite=False):
        """Denoise the mixed spectra to obtain denoised test sets.

        Keyword arguments:
            noise_ratios -- iterable containing the ratio of the noise
            compared to the signal (default: [0.5, 1.0]),
            enp_model_index -- selects which ego-noise predictor (enp) to use
            for denoising, if multiple are available (default: 0),
            overwrite -- whether to overwrite existing data (default: False).

        Note: only noise ratios that were used for the mixing of noisy data
        (via create_noisy_dataset) can be used for denoising.
        """
        # set output directory
        dir_out = os.path.join(
            self._dir_root_ac,
            "Features",
            "Denoised",
            "Spectra",
            "Ratio_%.2f" % noise_ratio,
            "Test",
        )
        # check if it exists or should be overwritten
        if os.path.exists(dir_out) and not overwrite:
            return
        refresh_directory(dir_out)

        # get directory containing noise model
        dir_model_enp = sorted(
            os.listdir(os.path.join(self._dir_root_enp,
                                    "Models")))[enp_model_index]
        # get context frames (TODO: read this from the model config instead)
        model_context = int(dir_model_enp.split("_")[-1][-1])
        # load files containing predicted noise
        dir_predicted = os.path.join(
            self._dir_root_enp,
            "Models",
            dir_model_enp,
            "Output",
            "Test",
            "Predicted",
        )
        files_predicted = retrieve_files(dir_predicted)

        # load states files
        dir_states = os.path.join(self._dir_root_ac, "Features", "Mixed",
                                  "States", "Test")
        files_states = retrieve_files(dir_states)

        # load mixed files
        dir_mix = os.path.join(
            self._dir_root_ac,
            "Features",
            "Mixed",
            "Spectra",
            "Ratio_%.2f" % noise_ratio,
            "Test",
        )
        files_mix = retrieve_files(dir_mix)

        # set up pool
        part = partial(
            self._create_denoised_feature,
            files_predicted=files_predicted,
            files_states=files_states,
            context=model_context,
            ratio=noise_ratio,
            dir_out=dir_out,
        )
        with Pool(processes=max(1, os.cpu_count() - 1)) as pool:
            pool.map(part, files_mix)
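
The mixed test set at the same ratio must exist before denoising; a hypothetical call order (object name assumed):

    classifier.create_mixed_test_set(noise_ratio=0.5)
    classifier.create_denoised_test_set(noise_ratio=0.5, enp_model_index=0)
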
    def split_dataset(self,
                      train_test_ratio=0.8,
                      train_val_ratio=0.8,
                      overwrite=False):
        """Split the dataset into a training, validation and test subset.

        Keyword arguments:
            train_test_ratio -- ratio of the training set over the complete,
            set, the remainder will be assigned to the test subset
            (default: 0.8),
            train_val_ratio -- ratio of the actual training set over the
            training set, the remainder will be assigned to the validation
            subset (default: 0.8),
            overwrite -- whether to overwrite existing data (default: False).
        """
        # directories
        dir_input = os.path.join(self._dir_root, "Aircraft Classification",
                                 "Audio", "Full")
        dir_root_output = os.path.join(self._dir_root,
                                       "Aircraft Classification", "Audio")
        # check if data should be overwritten if it exists
        if os.path.exists(os.path.join(dir_root_output,
                                       "Train")) and not overwrite:
            print_verbose(
                self.verbose,
                "Dataset already exists and should not be overwritten.")
            return
        # refresh the output directories
        subdirs = ["Train", "Val", "Test"]
        for subdir in subdirs:
            refresh_directory(os.path.join(dir_root_output, subdir))

        # read files into array for easy slicing
        files = np.array(retrieve_files(dir_input))
        # get categories
        file_categories = np.array(
            [os.path.split(f)[-1].split("_")[0] for f in files])
        categories = np.unique(file_categories)
        # assumes a balanced dataset (equal number of files per category)
        files_per_category = len(files) // len(categories)

        # get train, val, test indices per category
        train_idcs, test_idcs = train_test_split(np.arange(files_per_category),
                                                 train_size=train_test_ratio,
                                                 random_state=42)
        train_idcs, val_idcs = train_test_split(train_idcs,
                                                train_size=train_val_ratio,
                                                random_state=42)
        print_verbose(
            self.verbose,
            "Split per category (Train, Val, Test): (%d, %d, %d)" %
            (len(train_idcs), len(val_idcs), len(test_idcs)),
        )

        # extract train, val, test files using indices and export to subdirs
        indices = [train_idcs, val_idcs, test_idcs]
        for idcs, subdir in zip(indices, subdirs):
            files_set = [
                f for f in files
                if int(os.path.split(f)[-1].split("_")[-1].split(".")[0]) -
                1 in idcs
            ]
            for file in files_set:
                dest = os.path.join(dir_root_output, subdir,
                                    os.path.split(file)[-1])
                shutil.copyfile(file, dest)

        # remove the now redundant 'Full' input directory
        shutil.rmtree(dir_input)
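
With the default ratios, each category ends up split roughly 64/16/20; a quick standalone check of the chained sklearn splits:

    import numpy as np
    from sklearn.model_selection import train_test_split

    idcs = np.arange(40)  # e.g. 40 files per category
    train, test = train_test_split(idcs, train_size=0.8, random_state=42)
    train, val = train_test_split(train, train_size=0.8, random_state=42)
    print(len(train), len(val), len(test))  # -> 25 7 8
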
    def create_mixed_dataset(self,
                             noise_ratio,
                             augmentations=None,
                             seed=42,
                             overwrite=False):
        """Mix clean spectra with ego-noise spectra at the specified ratios.

        Keyword arguments:
            noise_ratios -- iterable containing the ratio of the noise
            compared to the signal (default: [0.5, 1.0]),
            overwrite -- whether to overwrite existing data (default: False).
        Note that the mixing is entirely random. When generating new features
        (i.e. deciding to extract both 'Mel' and 'Stft' features instead of
        only 'Stft'), it is recommended to set overwrite to True to ensure
        mixing consistency between existing and new features.
        """
        # get list of dataset directory names
        if augmentations is not None:
            subsets = ["Train", "Val", "Test"]
            subsets += ["Train " + aug for aug in augmentations]
        else:
            # extract all sets available
            subsets = os.listdir(self._dir_root_audio)

        # loop through sets in dataset
        for subset in subsets:
            # retrieve 'clean' spectra (ESC-50)
            dir_clean = os.path.join(self._dir_root_ac, "Features", "Clean",
                                     "Spectra", subset)
            files_clean = retrieve_files(dir_clean)
            # retrieve ego-noise spectra (MAV)
            subset_enp = subset.split(" ")[
                0]  # use 'Train' set for augmentations
            dir_noise = os.path.join(self._dir_root_enp, "Dataset", subset_enp,
                                     "Spectra")
            files_noise = retrieve_files(dir_noise)
            # retrieve states belonging to ego-noise
            dir_states = os.path.join(self._dir_root_enp, "Dataset",
                                      subset_enp, "States")
            files_states = retrieve_files(dir_states)

            # generate a list of directory-specific 'seeds' from the given seed
            # to preserve reproducible randomness while multiprocessing
            dir_seed = seed + len(subset)
            np.random.seed(dir_seed)
            seeds = np.random.randint(0, 10 * len(files_clean),
                                      len(files_clean))

            # make state dir. if it does not exist (do not overwrite, ever)
            dir_out = os.path.join(self._dir_root_ac, "Features", "Mixed",
                                   "States", subset)
            if not os.path.exists(dir_out):
                os.makedirs(dir_out)

            # set output directory (spectra)
            dir_out = os.path.join(
                self._dir_root_ac,
                "Features",
                "Mixed",
                "Spectra",
                "Ratio_%.2f" % noise_ratio,
                subset,
            )
            # check if it exists or should be overwritten
            if os.path.exists(dir_out) and not overwrite:
                continue  # skip set
            refresh_directory(dir_out)

            # set up multiprocessing to mix audio
            part = partial(
                self._create_mixed_spectrum,
                files_noise=files_noise,
                files_states=files_states,
                ratio=noise_ratio,
                subset=subset,
            )
            with Pool(processes=max(1, os.cpu_count() - 1)) as pool:
                pool.starmap(part, list(zip(files_clean, seeds)))
    def extract_states(self):
        # Loop over subsets
        for subset in ["Train", "Val", "Test"]:
            # Get states files
            dir_states = os.path.join(self._dir_root, "Ego-Noise Prediction",
                                      "Dataset", subset, "States")
            files_states = retrieve_files(dir_states)
            # Directory for the unsynchronized states
            dir_output = os.path.join(self._dir_root_set, "Unsynchronized",
                                      subset, "States")
            refresh_directory(dir_output)

            # Loop through files in set
            for f in files_states:  # xyz in NED frame
                # Read in as dataframe
                df = pd.read_csv(f, header=0)
                # Add delta-rpm
                df["rpm_1_delta"] = np.diff(df["rpm_1"].to_numpy(), prepend=0)
                df["rpm_2_delta"] = np.diff(df["rpm_2"].to_numpy(), prepend=0)
                df["rpm_3_delta"] = np.diff(df["rpm_3"].to_numpy(), prepend=0)
                df["rpm_4_delta"] = np.diff(df["rpm_4"].to_numpy(), prepend=0)
                # Add delta-cmd
                df["cmd_thrust_delta"] = np.diff(df["cmd_thrust"].to_numpy(),
                                                 prepend=0)
                df["cmd_roll_delta"] = np.diff(df["cmd_roll"].to_numpy(),
                                               prepend=0)
                df["cmd_pitch_delta"] = np.diff(df["cmd_pitch"].to_numpy(),
                                                prepend=0)
                df["cmd_yaw_delta"] = np.diff(df["cmd_yaw"].to_numpy(),
                                              prepend=0)
                # Prune horizontal position
                df.drop(columns=["pos_x", "pos_y"], inplace=True)
                # Negate vertical position to get height
                df.rename(columns={"pos_z": "height"}, inplace=True)
                df["height"] *= -1
                # Replace north- and east velocities with magnitude (horizontal)
                df["vel_hor"] = np.sqrt(df["vel_x"]**2 + df["vel_y"]**2)
                df.drop(columns=["vel_x", "vel_y"], inplace=True)
                # Negate downwards velocity to get vertical velocity
                df.rename(columns={"vel_z": "vel_ver"}, inplace=True)
                df["vel_ver"] *= -1
                # Replace north- and east accelerations with magnitude (horizontal)
                df["acc_hor"] = np.sqrt(df["acc_x"]**2 + df["acc_y"]**2)
                df.drop(columns=["acc_x", "acc_y"], inplace=True)
                # Negate downwards acceleration to get vertical acceleration
                df.rename(columns={"acc_z": "acc_ver"}, inplace=True)
                df["acc_ver"] *= -1
                # Re-order the frame
                cols = [
                    "delta_t",
                    "rpm_1",
                    "rpm_2",
                    "rpm_3",
                    "rpm_4",
                    "rpm_1_delta",
                    "rpm_2_delta",
                    "rpm_3_delta",
                    "rpm_4_delta",
                    "cmd_thrust",
                    "cmd_roll",
                    "cmd_pitch",
                    "cmd_yaw",
                    "cmd_thrust_delta",
                    "cmd_roll_delta",
                    "cmd_pitch_delta",
                    "cmd_yaw_delta",
                    "height",
                    "vel_hor",
                    "vel_ver",
                    "acc_hor",
                    "acc_ver",
                    "angle_phi",
                    "angle_theta",
                    "angle_psi",
                    "rate_p",
                    "rate_q",
                    "rate_r",
                ]
                df = df[cols]
                # Export
                fn = os.path.split(f)[-1]
                df.to_csv(os.path.join(dir_output, fn),
                          header=True,
                          index=False)
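
The coordinate handling above follows the NED (north-east-down) convention: negating the z-components yields height and upward velocity/acceleration, and the horizontal components are collapsed into magnitudes. A tiny worked example:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"pos_z": [-1.2], "vel_x": [3.0], "vel_y": [4.0]})
    height = -df["pos_z"]                               # NED: z points down
    vel_hor = np.sqrt(df["vel_x"]**2 + df["vel_y"]**2)  # horizontal magnitude
    print(height.item(), vel_hor.item())                # 1.2 5.0
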
Example No. 17
    def split_features(
        self, subset=None, augmentations=None, noise_set=None, noise_ratio=None
    ):
        # default 'noise' is no noise (clean)
        if noise_set is None:
            noise_set = "Clean"

        if subset is not None:
            if isinstance(subset, str):
                subset = [subset]
            subsets = subset
        else:
            # split 'Train', 'Val', 'Test' set if no specific subset is given
            if augmentations is not None:
                if isinstance(augmentations, str):
                    augmentations = [augmentations]
                # add specific augmentation(s) to default sets
                subsets = ["Train", "Val", "Test"]
                subsets += ["Train " + a for a in augmentations]
            else:
                # use all available augmentations
                if noise_set == "Clean":
                    subsets = os.listdir(
                        os.path.join(self._dir_root_set, "Features", "Clean", "Spectra")
                    )
                else:
                    subsets = os.listdir(
                        os.path.join(
                            self._dir_root_set,
                            "Features",
                            noise_set,
                            "Spectra",
                            "Ratio_%.2f" % noise_ratio,
                        )
                    )

        # root input directory spectra
        dir_root_spectra_in = os.path.join(
            self._dir_root_set, "Features", noise_set, "Spectra"
        )
        if noise_set != "Clean":
            dir_root_spectra_in = os.path.join(
                dir_root_spectra_in, "Ratio_%.2f" % noise_ratio
            )

        for subset in sorted(subsets):
            # load 5-second spectra belonging to categories
            dir_in_spectra = os.path.join(dir_root_spectra_in, subset)
            files_spectra = retrieve_files(dir_in_spectra)
            files_spectra = [
                f
                for f in files_spectra
                if os.path.split(f)[-1].split("_")[0]
                in self._classification["categories"]
            ]

            # set output directory (augmentations i.e. 'Train Denoised' go in 'Train')
            dir_out_spectra = os.path.join(
                self._dir_root_set, "Dataset", subset.split(" ")[0], "Spectra"
            )
            # refresh directories only for non-augmented sets
            if subset in ["Train", "Val", "Test"]:
                refresh_directory(dir_out_spectra)

            # split spectra
            part = partial(self._split_spectra, dir_output=dir_out_spectra)
            with Pool(processes=max(1, os.cpu_count() - 1)) as pool:
                pool.map(part, files_spectra)

            # split the states in case of implicit denoising
            if hasattr(self, "_states"):
                # load states
                dir_in_states = os.path.join(
                    self._dir_root_set, "Features", "Mixed", "States", subset
                )
                files_states = retrieve_files(dir_in_states)
                files_states = [
                    f
                    for f in files_states
                    if os.path.split(f)[-1].split("_")[0]
                    in self._classification["categories"]
                ]

                # refresh the output directory only for non-augmented sets
                dir_out_states = os.path.join(
                    self._dir_root_set, "Dataset", subset.split(" ")[0], "States"
                )
                if subset in ["Train", "Val", "Test"]:
                    refresh_directory(dir_out_states)

                # split states
                part = partial(self._split_states, dir_output=dir_out_states)
                with Pool(processes=max(1, os.cpu_count() - 1)) as pool:
                    pool.map(part, files_states)

            print_verbose(
                self.verbose,
                "Split %d files (%d categories) into %d files"
                % (
                    len(files_spectra),
                    len(self._classification["categories"]),
                    len(os.listdir(dir_out_spectra)),
                ),
            )
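
Hypothetical calls (object name assumed), covering the clean default and a mixed variant at a given ratio:

    classifier.split_features()                                # clean, all sets
    classifier.split_features(noise_set="Mixed", noise_ratio=1.0)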