Example #1
    def run(self):
        """ Runs the original REPET algorithm

        Returns:
            background (AudioSignal): An AudioSignal object with repeating background in background.audio_data
            (to get the corresponding non-repeating foreground run self.make_audio_signals())

        Example:
             ::

            signal = nussl.AudioSignal(path_to_input_file='input_name.wav')

            # Set up and run Repet
            repet = nussl.Repet(signal)
            repet.run()  # or repet()

            # Get audio signals
            background, foreground = repet.make_audio_signals()

            # output the background
            background.write_audio_to_file('background.wav')

        """
        # High pass filter cutoff freq. (in # of freq. bins), +1 to match MATLAB implementation
        self.high_pass_cutoff = int(np.ceil(self.high_pass_cutoff * (self.stft_params.n_fft_bins - 1) /
                                            self.audio_signal.sample_rate)) + 1

        # the MATLAB implementation is 1-indexed, so matlab_fidelity mode starts the mask at bin 1
        low = 1 if self.matlab_fidelity else 0

        self._compute_spectrum()
        self.repeating_period = self._calculate_repeating_period()

        # separate the mixture background by masking
        background_stft = []
        for i in range(self.audio_signal.num_channels):
            repeating_mask = self._compute_repeating_mask(self.magnitude_spectrogram[:, :, i])

            repeating_mask[low:self.high_pass_cutoff, :] = 1  # high-pass filter the foreground

            # apply mask
            stft_with_mask = repeating_mask * self.stft[:, :, i]
            background_stft.append(stft_with_mask)

        background_stft = np.array(background_stft).transpose((1, 2, 0))
        self.background = AudioSignal(stft=background_stft, sample_rate=self.audio_signal.sample_rate)
        self.background.istft(self.stft_params.window_length, self.stft_params.hop_length,
                              self.stft_params.window_type, overwrite=True,
                              use_librosa=self.use_librosa_stft)

        return self.background
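For concreteness, the first statement in run() converts the cutoff from Hz to a frequency-bin index. A quick sanity check of that arithmetic with assumed values (44.1 kHz sample rate, 2048 FFT bins; both are illustrative, not taken from the example above):

import numpy as np

high_pass_cutoff = 100.0  # Hz (the Repet default)
n_fft_bins = 2048         # assumed STFT size
sample_rate = 44100       # assumed sample rate

# Same formula as in run(): scale Hz by bins-per-Hz, round up, +1 for MATLAB parity
cutoff_bin = int(np.ceil(high_pass_cutoff * (n_fft_bins - 1) / sample_rate)) + 1
print(cutoff_bin)  # -> 6: the lowest six rows of the mask get forced to 1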
Example #2
def _make_audio_signal_from_stem(filename, stem, i, sr, label):
    """
    Reads in a :param:`stem` from the stempeg library (`stempeg.read_stems()`) and creates a
    correctly formatted :class:`AudioSignal` object (with all the metadata set up).
    Args:
        filename (str): Name of the file on disk.
        stem (:obj:`np.ndarray`): Numpy array from the `stempeg.read_stems()` function.
        i (int): Index of the :param:`stem` array to get audio data from.
        sr (int): Sample rate.
        label (str): Label for the :class:`AudioSignal` object.

    Returns:
        (:obj:`AudioSignal`) Correctly formatted :class:`AudioSignal` object with the
            right metadata.

    """
    signal = AudioSignal(audio_data_array=stem[i,...], sample_rate=sr)
    signal.path_to_input_file = filename
    signal.label = label
    return signal
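A hedged usage sketch for this helper, assuming the stempeg package (its read_stems() returns an array shaped (num_stems, samples, channels) plus the sample rate); the file name and labels here are illustrative:

import stempeg

filename = 'track.stem.mp4'  # hypothetical stem file on disk
stems, rate = stempeg.read_stems(filename)

labels = ['mixture', 'drums', 'bass', 'other', 'vocals']  # conventional stem order
signals = [_make_audio_signal_from_stem(filename, stems, i, rate, labels[i])
           for i in range(stems.shape[0])]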
Example #3
class Repet(separation_base.SeparationBase):
    """Implements the original REpeating Pattern Extraction Technique algorithm using the beat spectrum.

    REPET is a simple method for separating a repeating background from a non-repeating foreground in an
    audio mixture. It assumes a single repeating period over the whole signal duration, and finds that
    period based on finding a peak in the beat spectrum. The period can also be provided exactly, or you
    can give ``Repet`` a guess of the min and max period. Once it has a period, it "overlays" spectrogram
    sections of length ``period`` to create a median model (the background).

    References:
        * Zafar Rafii and Bryan Pardo. "Audio Separation System and Method," US20130064379 A1, US 13/612,413, March 14,
          2013

    See Also:
        http://music.eecs.northwestern.edu/research.php?project=repet

    Parameters:
        input_audio_signal (:obj:`AudioSignal`): The ``AudioSignal`` object that REPET will be run on.
            This makes a copy of ``input_audio_signal``
        min_period (float, optional): minimum time to look for repeating period in terms of seconds.
        max_period (float, optional): maximum time to look for repeating period in terms of seconds.
        period (float, optional): exact time that the repeating period is (in seconds).
        high_pass_cutoff (float, optional): value (in Hz) for the high pass cutoff filter.
        do_mono (bool, optional): Flattens ``AudioSignal`` to mono before running the algorithm (does not affect the
                        input ``AudioSignal`` object)
        use_find_period_complex (bool, optional): Will use a more complex peak picker to find the repeating period
        use_librosa_stft (bool, optional): Calls librosa's stft function instead of nussl's
        matlab_fidelity (bool, optional): If True, does repet with the same settings as the original MATLAB
                        implementation of REPET, warts and all. This will override ``use_librosa_stft`` and set
                        it to ``False``.

    Examples:
        :ref:`The REPET Demo Example <repet_demo>`

    Attributes:
        background (:obj:`AudioSignal`): Calculated background. This is ``None`` until ``run()`` is called.
        foreground (:obj:`AudioSignal`): Calculated foreground. This is ``None`` until ``make_audio_signals()``
            is called.
        beat_spectrum (:obj:`np.array`): Beat spectrum calculated by Repet.

    """
    def __init__(self, input_audio_signal, min_period=None, max_period=None, period=None, high_pass_cutoff=None,
                 do_mono=False, use_find_period_complex=False, use_librosa_stft=config.USE_LIBROSA_STFT,
                 matlab_fidelity=False):
        super(Repet, self).__init__(input_audio_signal=input_audio_signal)
        self.high_pass_cutoff = 100.0 if high_pass_cutoff is None else float(high_pass_cutoff)
        self.background = None
        self.foreground = None
        self.beat_spectrum = None
        self.use_find_period_complex = use_find_period_complex
        self.use_librosa_stft = use_librosa_stft

        self.repeating_period = None
        self.magnitude_spectrogram = None
        self.stft = None
        self.matlab_fidelity = matlab_fidelity
        self._is_period_converted_to_hops = False

        if self.matlab_fidelity:
            self.use_librosa_stft = False

        # TODO: stereo doesn't do true stereo REPET (see TODO below)
        if do_mono:
            self.audio_signal.to_mono(overwrite=True)

        if (min_period or max_period) and period:
            raise ValueError('Cannot set both period and (min_period or max_period)!')

        # Set period parameters
        self.min_period, self.max_period, self.period = None, None, None
        if period is None:
            self.min_period = 0.8 if min_period is None else min_period
            self.max_period = min(8, self.audio_signal.signal_duration / 3) if max_period is None else max_period
        else:
            self.period = period
            if not self._is_period_converted_to_hops:
                self.period = self._update_period(self.period)
                self._is_period_converted_to_hops = True


    def run(self):
        """ Runs the original REPET algorithm

        Returns:
            background (AudioSignal): An AudioSignal object with repeating background in background.audio_data
            (to get the corresponding non-repeating foreground run self.make_audio_signals())

        Example:
             ::

            signal = nussl.AudioSignal(path_to_input_file='input_name.wav')

            # Set up and run Repet
            repet = nussl.Repet(signal)
            repet.run()  # or repet()

            # Get audio signals
            background, foreground = repet.make_audio_signals()

            # output the background
            background.write_audio_to_file('background.wav')

        """
        # High pass filter cutoff freq. (in # of freq. bins), +1 to match MATLAB implementation
        self.high_pass_cutoff = int(np.ceil(self.high_pass_cutoff * (self.stft_params.n_fft_bins - 1) /
                                            self.audio_signal.sample_rate)) + 1

        # the MATLAB implementation is 1-indexed, so matlab_fidelity mode starts the mask at bin 1
        low = 1 if self.matlab_fidelity else 0

        self._compute_spectrum()
        self.repeating_period = self._calculate_repeating_period()

        # separate the mixture background by masking
        background_stft = []
        for i in range(self.audio_signal.num_channels):
            repeating_mask = self._compute_repeating_mask(self.magnitude_spectrogram[:, :, i])

            repeating_mask[low:self.high_pass_cutoff, :] = 1  # high-pass filter the foreground

            # apply mask
            stft_with_mask = repeating_mask * self.stft[:, :, i]
            background_stft.append(stft_with_mask)

        background_stft = np.array(background_stft).transpose((1, 2, 0))
        self.background = AudioSignal(stft=background_stft, sample_rate=self.audio_signal.sample_rate)
        self.background.istft(self.stft_params.window_length, self.stft_params.hop_length,
                              self.stft_params.window_type, overwrite=True,
                              use_librosa=self.use_librosa_stft)

        return self.background

    def _compute_spectrum(self):
        self.stft = self.audio_signal.stft(overwrite=True, remove_reflection=True, use_librosa=self.use_librosa_stft)
        self.magnitude_spectrogram = np.abs(self.stft)

    def get_beat_spectrum(self, recompute_stft=False):
        """Calculates and returns the beat spectrum for the audio signal associated with this object

        Args:
            recompute_stft: (Optional) (bool) Recompute the stft for the audio signal

        Returns:
            beat_spectrum (np.array): beat spectrum for the audio file

        EXAMPLE::

            # Set up audio signal
            signal = nussl.AudioSignal('path_to_file.wav')

            # Set up a Repet object
            repet = nussl.Repet(signal)

            # I don't have to run repet to get a beat spectrum for signal
            beat_spec = repet.get_beat_spectrum()
        """
        if recompute_stft or self.magnitude_spectrogram is None:
            self._compute_spectrum()

        # TODO: Make this multi-channel. The np.mean() reduces the n channels to 1.
        self.beat_spectrum = self.compute_beat_spectrum(np.mean(np.square(self.magnitude_spectrogram),
                                                                axis=self.audio_signal._STFT_CHAN).T)
        return self.beat_spectrum

    def _calculate_repeating_period(self):
        # user provided a period, so no calculations to do
        if self.period is not None:
            return self.period

        # get beat spectrum
        self.beat_spectrum = self.get_beat_spectrum()

        if self.use_find_period_complex:
            self.repeating_period = self.find_repeating_period_complex(self.beat_spectrum)
        else:
            # update the min and max so they're in units of frequency bin indices
            if not self._is_period_converted_to_hops:
                self.min_period = self._update_period(self.min_period)
                self.max_period = self._update_period(self.max_period)
                self._is_period_converted_to_hops = True

            self.repeating_period = self.find_repeating_period_simple(self.beat_spectrum,
                                                                      self.min_period, self.max_period)

        return self.repeating_period

    @staticmethod
    def compute_beat_spectrum(power_spectrum):
        """Computes the beat spectrum averages (over freq's) the autocorrelation matrix of a one-sided spectrogram.

         The autocorrelation matrix is computed by taking the autocorrelation of each row of the spectrogram and
         dismissing the symmetric half.

        Parameters:
            power_spectrum (np.array): 2D matrix containing the one-sided power
            spectrogram of the audio signal (Lf by Lt)
        Returns:
            (np.array): array containing the beat spectrum based on the power spectrogram
        """
        freq_bins, time_bins = power_spectrum.shape

        # row-wise autocorrelation according to the Wiener-Khinchin theorem
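        # (zero-padding each column to twice its length makes the FFT-based, i.e.
        # circular, autocorrelation below equal the linear autocorrelation)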
        power_spectrum = np.vstack([power_spectrum, np.zeros_like(power_spectrum)])
        fft_power_spec = scifft.fft(power_spectrum, axis=0)
        abs_fft = np.abs(fft_power_spec) ** 2
        autocorrelation_rows = np.real(scifft.ifft(abs_fft, axis=0)[:freq_bins, :])  # ifft over columns

        # normalization factor
        norm_factor = np.tile(np.arange(freq_bins, 0, -1), (time_bins, 1)).T
        autocorrelation_rows = autocorrelation_rows / norm_factor

        # compute the beat spectrum
        beat_spectrum = np.mean(autocorrelation_rows, axis=1)  # average over frequencies

        return beat_spectrum

    @staticmethod
    def find_repeating_period_simple(beat_spectrum, min_period, max_period):
        """Computes the repeating period of the sound signal using the beat spectrum.
           This algorithm just looks for the max value in the interval [min_period, max_period] inclusive.
           It discards the first value, and returns the period in units of stft time bins.

        Parameters:
            beat_spectrum (np.array): input beat spectrum array
            min_period (int): minimum possible period value
            max_period (int): maximum possible period value
        Returns:
             period (int) : The period of the sound signal in stft time bins
        """
        min_period, max_period = int(min_period), int(max_period)
        beat_spectrum = beat_spectrum[1:]  # discard the first element of beat_spectrum (lag 0)
        beat_spectrum = beat_spectrum[min_period - 1: max_period]
        period = np.argmax(beat_spectrum) + min_period

        return period

    @staticmethod
    def find_repeating_period_complex(beat_spectrum):
        """

        Args:
            beat_spectrum:

        Returns:

        """
        auto_cosine = np.zeros((len(beat_spectrum), 1))

        for i in range(0, len(beat_spectrum) - 1):
            auto_cosine[i] = 1 - scipy.spatial.distance.cosine(beat_spectrum[0:len(beat_spectrum) - i],
                                                               beat_spectrum[i:len(beat_spectrum)])

        ac = auto_cosine[0:auto_cosine.shape[0] // 2]  # keep the first half (integer division)
        auto_cosine = np.vstack([ac[1], ac, ac[-2]])
        auto_cosine_diff = np.ediff1d(auto_cosine)
        sign_changes = auto_cosine_diff[0:-1]*auto_cosine_diff[1:]
        sign_changes = np.where(sign_changes < 0)[0]

        extrema_values = ac[sign_changes]

        e1 = np.insert(extrema_values, 0, extrema_values[0])
        e2 = np.insert(extrema_values, -1, extrema_values[-1])

        extrema_neighbors = np.stack((e1[0:-1], e2[1:]))

        m = np.amax(extrema_neighbors, axis=0)
        extrema_values = extrema_values.flatten()
        maxima = np.where(extrema_values >= m)[0]
        maxima = list(zip(sign_changes[maxima], extrema_values[maxima]))
        maxima = maxima[1:]
        maxima = sorted(maxima, key=lambda x: -x[1])
        period = maxima[0][0]

        return period

    def _compute_repeating_mask(self, magnitude_spectrogram_channel):
        """Computes the soft mask for the repeating part using the magnitude spectrogram and the repeating period

        Parameters:
            magnitude_spectrogram_channel (np.array): 2D matrix containing the magnitude spectrogram of a signal
        Returns:
            M (np.array): 2D matrix (Lf by Lt) containing the soft mask for the repeating part, elements of M take on
            values in [0,1]

        """
        # this +1 is a kluge to make this implementation match the original MATLAB implementation
        period = self.repeating_period + 1
        freq_bins, time_bins = magnitude_spectrogram_channel.shape
        n_repetitions = int(np.ceil(float(time_bins) / period))
        one_period = freq_bins * period

        # Pad to make an integer number of repetitions. Pad with 'nan's to not affect the median.
        remainder = (period * n_repetitions) % time_bins
        mask_reshaped = np.hstack([magnitude_spectrogram_channel, float('nan') * np.zeros((freq_bins, remainder))])

        # reshape to take the median of each period
        mask_reshaped = np.reshape(mask_reshaped.T, (n_repetitions, one_period))

        # take median of repeating periods before and after the padding
        median_mask = np.nanmedian(mask_reshaped, axis=0)

        # reshape to its original shape
        median_mask = np.reshape(np.tile(median_mask, (n_repetitions, 1)), (n_repetitions * period, freq_bins)).T
        median_mask = median_mask[:, :time_bins]

        # take minimum of computed mask and original input and scale
        min_median_mask = np.minimum(median_mask, magnitude_spectrogram_channel)
        mask = (min_median_mask + constants.EPSILON) / (magnitude_spectrogram_channel + constants.EPSILON)

        return mask

    def update_periods(self):
        """ Will update periods for use with ``self.find_repeating_period_simple()``.

        Updates from seconds to stft bin values.
        Call this before using ``self.find_repeating_period_simple()`` directly if you haven't
        run ``self.run()``; otherwise the periods will still be in seconds and the results will be wrong.

        Examples:
            ::

            a = nussl.AudioSignal('path/to/file.wav')
            r = nussl.Repet(a)

            beat_spectrum = r.get_beat_spectrum()
            r.update_periods()
            repeating_period = r.find_repeating_period_simple(beat_spectrum, r.min_period, r.max_period)

        """
        if not self._is_period_converted_to_hops:
            self.period = self._update_period(self.period) if self.period is not None else None
            self.min_period = self._update_period(self.min_period) if self.min_period is not None else None
            self.max_period = self._update_period(self.max_period) if self.max_period is not None else None
            self._is_period_converted_to_hops = True

    def _update_period(self, period):
        period = float(period)
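        # convert a period in seconds to a count of STFT hops, mirroring the
        # seconds-to-frames conversion of the original MATLAB REPET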
        result = period * self.audio_signal.sample_rate
        result += self.stft_params.window_length / self.stft_params.window_overlap - 1
        result /= self.stft_params.window_overlap
        return int(np.ceil(result))

    def plot(self, output_file, **kwargs):
        """
        Creates a plot of the beat spectrum and outputs to output_file.


        Parameters:
            output_file (string) : string representing a path to the desired output file to be created.
            title: (string) Title to put on the plot
            show_repeating_period: (bool) if True, then adds a vertical line where repet thinks
                                the repeating period is (if the repeating period has been computed already)

        EXAMPLE:
             ::

            signal = nussl.AudioSignal('Sample.wav')
            repet = nussl.Repet(signal)

            repet.plot('new_beat_spec_plot.png', title="Beat Spectrum of Sample.wav", show_repeating_period=True)
        """
        import matplotlib.pyplot as plt
        plt.close('all')
        title = kwargs.get('title', None)
        show_repeating_period = kwargs.get('show_repeating_period', False)

        beat_spec = self.get_beat_spectrum()
        time_vect = np.linspace(0.0, self.audio_signal.signal_duration, num=len(beat_spec))
        plt.plot(time_vect, beat_spec)

        if self.repeating_period is not None and show_repeating_period:
            stft_vector = np.linspace(0.0, self.audio_signal.signal_duration, self.audio_signal.stft_length)
            rep = stft_vector[self.repeating_period]
            plt.plot((rep, rep), (0, np.max(beat_spec)), 'g--', label='Repeating period')
            # plt.plot((self.repeating_period, self.repeating_period), (-1e20, 1e20), 'g--')
            plt.ylim((0.0, np.max(beat_spec) * 1.1))

        title = title if title is not None else 'Beat Spectrum for {}'.format(self.audio_signal.file_name)
        plt.title(title)

        plt.xlabel('Time (s)')
        plt.ylabel('Beat Strength')
        plt.grid(True)

        plt.axis('tight')
        plt.savefig(output_file)

    def make_audio_signals(self):
        """ Returns the background and foreground audio signals. You must have run Repet.run() prior
        to calling this function. This function will return None if run() has not been called.

        Returns:
            Audio Signals (List): 2 element list.

                * background: Audio signal with the calculated background track
                * foreground: Audio signal with the calculated foreground track

        EXAMPLE:
             ::

            # set up AudioSignal object
            signal = nussl.AudioSignal('path_to_file.wav')

            # set up and run repet
            repet = nussl.Repet(signal)
            repet.run()

            # get audio signals (AudioSignal objects)
            background, foreground = repet.make_audio_signals()
        """
        if self.background is None:
            return None

        self.foreground = self.audio_signal - self.background
        self.foreground.sample_rate = self.audio_signal.sample_rate
        return [self.background, self.foreground]
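The heart of REPET, as the class docstring above describes, is a median model: stack spectrogram segments of one repeating period and take the element-wise median, which suppresses anything that does not repeat. A minimal self-contained sketch of that idea with toy shapes (this is not nussl's implementation, which pads with NaNs and uses nanmedian):

import numpy as np

# Toy magnitude spectrogram: 4 freq bins x 9 frames repeating every 3 frames,
# plus a loud non-repeating event in frames 4-5 (the "foreground")
period, n_repetitions = 3, 3
one_period = np.random.rand(4, period) + 0.1
spec = np.tile(one_period, n_repetitions)
spec[:, 4:6] += 5.0

# Stack the period-length segments and take the element-wise median;
# the event lives in only one segment, so the median ignores it
segments = spec.reshape(4, n_repetitions, period)
median_model = np.median(segments, axis=1)             # shape (4, period)

# Soft mask: keep only what the repeating model can explain
background = np.tile(median_model, n_repetitions)
mask = np.minimum(background, spec) / (spec + 1e-8)    # values in [0, 1]
print(mask[:, 4:6].round(2))                           # small where the event dominates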
Example #4
def mir1k(directory, check_hash=True, subset=None, shuffle=False, seed=None, undivided=False):
    """
    Generator function for the MIR-1K data set. This allows you to loop through the entire data set
    with only a few :class:`AudioSignal` objects stored in memory at a time. There are options for
    only looping through a subset of the data set and shuffling the data set (with a seed). See
    details about those options below.

    `nussl` calculates the hash of the MIR-1K directory and compares it against a precomputed hash
    for MIR-1K that ships with `nussl`. This hash is used to verify that `nussl` understands the
    directory structure when reading the files. Calculating the hash can be turned off if the
    user needs a speed up, but this might cause hard-to-diagnose errors if the MIR-1K directory is not set up
    in the same way as a fresh download of MIR-1K.

    MIR-1K also ships with two 'sets' of audio files: the divided and undivided sets. They contain
    the same content, the only difference is that the undivided set is one file per song, each song
    taking up the whole file, and the divided set has the same song divided into segments of ~3-12
    seconds. The :param:`undivided` parameter controls which of these two sets `nussl` will loop
    through.

    Examples:
        Using this generator function to loop through the MIR-1K data set. In this example, we use
        the generator directly in the ``for`` loop.

        .. code-block:: python
            :linenos:

            mir1k_path = '/path/to/MIR-1K'  # the MIR-1K directory on disk
            for mix, vox, acc in nussl.datasets.mir1k(mir1k_path):
                mix.to_mono(overwrite=True)  # sum to mono to make a 'mixture'

                # Get some basic metadata on the files.
                # (They'll all have the same file name, but different labels)
                print('Mixture       - Filename: {}, Label: {}'.format(mix.file_name, mix.label))
                print('Vocals        - Filename: {}, Label: {}'.format(vox.file_name, vox.label))
                print('Accompaniment - Filename: {}, Label: {}'.format(acc.file_name, acc.label))

                # Run an algorithm on the MIR-1K files and save to disc
                r = nussl.Repet(mix)
                r.run()
                bg_est, fg_est = r.make_audio_signals()
                bg_est.write_audio_to_file('{}_bg.wav'.format(os.path.splitext(mix.file_name)[0]))
                fg_est.write_audio_to_file('{}_fg.wav'.format(os.path.splitext(mix.file_name)[0]))

        It's also possible to use ``tqdm`` to print the progress to the console. This is useful
        because running through an entire data set can take a while. Here's a more advanced example
        using some other options as well:

        .. code-block:: python
            :linenos:

            import nussl
            from tqdm import tqdm

            mir1k_path = 'path/to/MIR-1K'  # the MIR-1K directory on disk
            idxs = range(29, 150)[::2]  # Only get every other song between [29, 150)
            mir1k_gen = nussl.datasets.mir1k(mir1k_path, subset=idxs,
                                             check_hash=False, undivided=True)

            # Tell tqdm the number of files we're running on so it can estimate a completion time
            for mixture, vocals, accompaniment in tqdm(mir1k_gen, total=len(idxs)):
                mixture.to_mono(overwrite=True)  # sum to mono to make a 'mixture'

                # Run an algorithm on the MIR-1K files and save to disc
                r = nussl.Repet(mixture)
                r.run()
                bg_est, fg_est = r.make_audio_signals()
                bg_est.write_audio_to_file('{}_bg.wav'.format(os.path.splitext(mixture.file_name)[0]))
                fg_est.write_audio_to_file('{}_fg.wav'.format(os.path.splitext(mixture.file_name)[0]))

    Args:
        directory (str): Top-level directory for the MIR-1K data set.
        check_hash (bool, str): In the case that there is a mismatch between the expected and
            calculated hash, if this parameter is ``True`` (a bool) an exception is raised and
            if this parameter is ``'warn'`` (a string) a warning is printed to the console. If
            this parameter is ``False``, the hash will not be calculated for this directory,
            i.e., this function does nothing.
        subset (float, list, str, None): This parameter determines how to make a subset of the
            audio files in the data set. There are four ways to use it, depending on what type
            this parameter takes:

            1) If :param:`subset` is a ``float``, then :param:`subset` will return the first
               ``X.Y%`` of audio files, where ``X.Y%`` is some arbitrary percentage. In this
               case, :param:`subset` is expected to be in the range [0.0, 1.0].
            2) If :param:`subset` is a ``list``, it is expected to be a list of indices (as
               ``int``s). This function will then produce the audio files in the list that
               correspond to those indices.
            3) If :param:`subset` is a ``str``, it will only include audio files with that
               string somewhere in the directory name.
            4) If :param:`subset` is ``None``, then the whole data set is traversed
               unobstructed.
        shuffle (bool): Whether the data set should be shuffled.
        seed (int, 1-d array_like): Seed for ``numpy``'s random number generator used for
            shuffling.
        undivided (bool): Whether to use the divided (in the ``Wavfile`` directory) or
            undivided (in the ``UndividedWavfile`` directory) set.

    Yields:
        (``tuple(AudioSignal, AudioSignal, AudioSignal)``):
            A tuple of three :class:`AudioSignal` objects, with audio loaded for each source. In
            the tuple, they are returned in the following order:
            ``(mixture, vocals, accompaniment)``. In MIR-1K, the audio files are such that the
            vocals are hard panned to one channel and the accompaniment is hard panned to the other.
            So, the 'mixture' yielded here by this function reflects this, and needs to be mixed
            down to mono. In other words, ``mixture`` is a stereo :class:`AudioSignal` object,
            where each channel is one source, and similarly ``vocals`` and ``accompaniment`` are
            mono :class:`AudioSignal` objects made from a single channel in ``mixture``.

    """

    top_dir_name = 'MIR-1K'

    wavfile_hash = '33c085c1a7028199cd20317868849b413e0971022ebc4aefcf1bbc5516646c29'
    undivided_hash = '3f39af9be17515e042a7005b4c47110c6738623a7fada6233ba104535b7dde1b'

    if undivided:
        audio_dir_name = 'UndividedWavfile'
        mir1k_hash = undivided_hash
    else:
        audio_dir_name = 'Wavfile'
        mir1k_hash = wavfile_hash

    audio_extension = '.wav'
    all_wav_files = _data_set_setup(directory, top_dir_name, audio_dir_name,
                                    mir1k_hash, check_hash, audio_extension)

    all_wav_files = _subset_and_shuffle(all_wav_files, subset, shuffle, seed)

    for f in all_wav_files:
        mixture = AudioSignal(f)
        mixture.label = 'mixture'

        vocals = mixture.make_audio_signal_from_channel(1)
        vocals.label = 'vocals'

        accompaniment = mixture.make_audio_signal_from_channel(0)
        accompaniment.label = 'accompaniment'

        yield mixture, vocals, accompaniment
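The four subset modes described in the Args section, exercised side by side (the path and the 'abjones' file-name fragment are illustrative):

import nussl

mir1k_path = '/path/to/MIR-1K'  # hypothetical location on disk

# 1) float: first 10% of the audio files
gen_percent = nussl.datasets.mir1k(mir1k_path, subset=0.1)

# 2) list of indices: only files 0, 5, and 10
gen_indices = nussl.datasets.mir1k(mir1k_path, subset=[0, 5, 10])

# 3) str: only files with this string in the directory name
gen_string = nussl.datasets.mir1k(mir1k_path, subset='abjones')

# 4) None (the default): the whole set, shuffled reproducibly
gen_all = nussl.datasets.mir1k(mir1k_path, shuffle=True, seed=42)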
Example #5
def kam(Inputfile, SourceKernels, Numit=1, SpecParams=np.array([]), FullKernel=False):
    """
    The 'kam' function implements the kernel backfitting algorithm to extract 
    J audio sources from I channel mixtures.
    
    Inputs: 
    Inputfile: (list) It can contain either:
               - Up to 3 elements: A string indicating the path of the .wav file containing
                 the I-channel audio mixture as the first element. The second (optional)
                 element indicates the length of the portion of the signal to be extracted
                 in seconds (default is the full length of the signal). The third (optional)
                 element indicates the starting point of the portion of the signal to be
                 extracted (default is 0 sec).
               OR
               - 2 elements: An I-column Numpy matrix containing samples of the time-domain 
                 mixture as the first element and the sampling rate as the second element.
          
    SourceKernels: a list containing J sub-lists, each of which contains properties of
                   one of source kernels. Kernel properties are:
                   -kernel type: (string) determines whether the kernel is one of the 
                                 pre-defined kernel types or a user-defined lambda function. 
                                 Choices are: 'cross','horizontal','vertical','periodic','userdef'
                   -kparams (for pre-defined kernels): a Numpy matrix containing the numerical 
                             values of the kernel parameters.
                   -knhood (for user-defined kernels): logical lambda function which
                            receives the coordinates of two time-frequency bins and determines
                            whether they are neighbours (outputs TRUE if neighbour).
                   -kwfunc (optional): lambda function which receives the coordinates of two 
                            neighbouring time-frequency bins and computes the weight value at
                            the second bin given its distance from the first bin. The weight
                            values fall in the interval [0,1]. Default: all ones over the 
                            neighbourhood (binary kernel).
    
    Numit: (optional) number of iterations of the backfitting algorithm - default: 1     
                   
    SpecParams: (optional) structured array containing spectrogram parameters including:
                         - windowtype (default is Hamming)
                         - windowlength (default is 60 ms)
                         - overlap_samples in [0,windowlength] (default is windowlength/2)
                         - num_fft_bins (default is windowlength)
                         - makeplot in {0,1} (default is 0)
                         - fmaxplot in Hz (default is fs/2)
                         example: 
                         SpecParams=np.zeros(1,dtype=[('windowtype','|S1'),
                                                      ('windowlength',int),
                                                      ('overlap_samples',int),
                                                      ('num_fft_bins',int),
                                                      ('makeplot',int),
                                                      ('fmaxplot',float)])
                         SpecParams['windowlength']=1024   

    FullKernel: (optional) binary input which determines the method used for median filtering.
               If the kernel has a limited support and the same shape over all time-freq. bins,
               then instead of the full kernel method a sliding window can be used in the median 
               filtering stage in order to make the algorithm run faster (linear computational
               complexity). The default value is False.
               A True value means implementing the case where the similarity measure is 
               computed for all possible combinations of TF bins, resulting in quadratic 
               computational complexity, and hence longer running time. 
                                        
        
    Outputs:
    shat: a Ls by I by J Numpy array containing J time-domain source images on I channels   
    fhat: a LF by LT by J Numpy array containing J power spectral densities
    """

    # Step (1): 
    # Load the audio mixture from the input path
    if len(Inputfile) == 2:
        Mixture = AudioSignal(audiosig=Inputfile[0], fs=Inputfile[1])
    elif len(Inputfile) == 3:
        Mixture = AudioSignal(file_name=Inputfile[0], siglen=Inputfile[1], sigstart=Inputfile[2])

    if len(SpecParams) != 0:
        for i in range(0, len(SpecParams.dtype)):
            nameTemp = SpecParams.dtype.names[i]
            # assign the parameter value directly; the old exec-based string
            # round-trip broke for non-numeric values
            setattr(Mixture, nameTemp, SpecParams[0][i])

    x, tvec = np.array([Mixture.x, Mixture.time])  # time-domain channel mixtures
    X, Px, Fvec, Tvec = Mixture.do_STFT()  # stft and PSD of the channel mixtures

    I = Mixture.numCh  # number of channel mixtures
    J = len(SourceKernels)  # number of sources
    LF = np.size(Fvec)  # length of the frequency vector
    LT = np.size(Tvec)  # length of the timeframe vector

    F_ind = np.arange(LF)  # frequency bin indices
    T_ind = np.arange(LT)  # time frame indices
    Tmesh, Fmesh = np.meshgrid(T_ind, F_ind)  # grid of time-freq indices for the median filtering step
    TFcoords = np.mat(np.zeros((LF * LT, 2), dtype=int))  # all time-freq index combinations
    TFcoords[:, 0] = np.mat(np.asarray(Fmesh.T).reshape(-1)).T
    TFcoords[:, 1] = np.mat(np.asarray(Tmesh.T).reshape(-1)).T


    # Generate source kernels:
    Kj = []
    for ns in range(0, J):
        SKj = SourceKernels[ns]
        if len(SKj) < 2:
            raise Exception('The information required for generating source kernels is insufficient.'
                            ' Each sub-list in SourceKernels must contain at least two elements.')
        KTYPE = SKj[0]
        if KTYPE != 'userdef' and len(SKj) == 2:
            Kj.append(Kernel(Type=SKj[0], ParamVal=SKj[1]))
        elif KTYPE != 'userdef' and len(SKj) == 3:
            Kj.append(Kernel(Type=SKj[0], ParamVal=SKj[1], Wfunc=SKj[2]))
        elif KTYPE == 'userdef' and len(SKj) == 2:
            Kj.append(Kernel(Type=SKj[0], Nhood=SKj[1]))
        elif KTYPE == 'userdef' and len(SKj) == 3:
            Kj.append(Kernel(Type=SKj[0], Nhood=SKj[1], Wfunc=SKj[2]))

    # Step (2): initialization
    # Initialize the PSDs with average mixture PSD and the spatial covariance matrices
    # with identity matrices

    X = np.reshape(X.T, (LF * LT, I))  # reshape the stft tensor into I vectors
    if I > 1:
        MeanPSD = np.mean(Px, axis=2) / (I * J)
    else:
        MeanPSD = Px / (J)
    MeanPSD = np.reshape(MeanPSD.T, (LF * LT, 1))  # reshape the mean PSD matrix into a vector

    fj = np.zeros((LF * LT, I * I, J))
    for j in range(0, J):
        fj[:, :, j] = np.tile(MeanPSD, (1, I * I))  # initialize by mean PSD

    Rj = 1j * np.zeros((1, I * I, J))
    for j in range(0, J):
        Rj[0, :, j] = np.reshape(np.eye(I), (1, I * I))
    Rj = np.tile(Rj, (LF * LT, 1, 1))

    ### Kernel Backfitting ###

    # start_time = time.clock()

    S = 1j * np.zeros((LF * LT, I, J))
    for n in range(0, Numit):

        # Step (3):
        # compute the inverse term: [sum_j' f_j' R_j']^-1
        SumFR = np.sum(fj * Rj, axis=2)  ###  !!!!!!!!!! careful about memory storage!
        SumFR.shape = (LF * LT, I, I)
        SumFR += 1e-16 * np.random.randn(LF * LT, I, I)  # to avoid singularity issues

        InvSumFR = np.zeros((LF * LT, I, I), dtype='single')
        if I == 1:
            InvSumFR = 1 / SumFR
        elif I == 2:
            InvDet = 1 / (SumFR[:, 0, 0] * SumFR[:, 1, 1] - SumFR[:, 0, 1] * SumFR[:, 1, 0])
            InvSumFR[:, 0, 0] = InvDet * SumFR[:, 1, 1]
            InvSumFR[:, 0, 1] = -InvDet * SumFR[:, 0, 1]
            InvSumFR[:, 1, 0] = -InvDet * SumFR[:, 1, 0]
            InvSumFR[:, 1, 1] = InvDet * SumFR[:, 0, 0]
        else:
            InvSumFR = np.linalg.inv(SumFR)
        InvSumFR.shape = (LF * LT, I * I)

        # compute sources, update PSDs and covariance matrices 
        for ns in range(0, J):
            FRinvsum = fj[:, :, ns] * Rj[:, :, ns] * InvSumFR
            Stemp = 1j * np.zeros((LF * LT, I))
            for nch in range(0, I):
                FRtemp = FRinvsum[:, nch * I:(nch + 1) * I]  # columns for channel nch
                Stemp[:, nch] = np.sum(FRtemp * X, axis=1)
            S[:, :, ns] = Stemp

            # Step (4-a):
            Cj = np.repeat(Stemp, I, axis=1) * np.tile(np.conj(Stemp), (1, I))

            # Step (4-b):
            Cj_reshape = np.reshape(Cj, (LF * LT, I, I))
            Cj_trace = np.mat(np.matrix.trace(Cj_reshape.T)).T
            MeanCj = Cj / np.tile(Cj_trace, (1, I * I))
            MeanCj_reshape = np.reshape(np.array(MeanCj), (LF, LT, I * I), order='F')
            Rj[:, :, ns] = np.tile(np.sum(MeanCj_reshape, axis=1), (LT, 1))

            # Step (4-c):
            # Note: the summation over 't' at step 4-c in the 2014 paper is a typo!
            #       the correct formulation of zj is:
            #       zj = (1/I) * tr(inv(Rj(w)) * Cj(w,t))
            Rj_reshape = np.reshape(Rj[:, :, ns], (LF * LT, I, I))
            Rj_reshape += 1e-16 * np.random.randn(LF * LT, I, I)

            InvRj = np.zeros((LF * LT, I, I), dtype='single')
            if I == 1:
                InvRj = 1 / Rj_reshape
            elif I == 2:
                InvDetR = 1 / (Rj_reshape[:, 0, 0] * Rj_reshape[:, 1, 1] - Rj_reshape[:, 0, 1] * Rj_reshape[:, 1, 0])
                InvRj[:, 0, 0] = InvDetR * Rj_reshape[:, 1, 1]
                InvRj[:, 0, 1] = -InvDetR * Rj_reshape[:, 0, 1]
                InvRj[:, 1, 0] = -InvDetR * Rj_reshape[:, 1, 0]
                InvRj[:, 1, 1] = InvDetR * Rj_reshape[:, 0, 0]
            else:
                InvRj = np.linalg.inv(Rj_reshape)
            InvRj.shape = (LF * LT, I * I)

            InvRjCj = np.reshape(InvRj * Cj, (LF * LT, I, I))
            zj = np.real(np.matrix.trace(InvRjCj.T) / I)
            zj = np.mat(zj)

            # Step (4-d):
            # Median filter the estimated PSDs

            # start_time = time.clock()
            Ktemp = Kj[ns]  # Kernel corresponding to the source #j

            if not FullKernel:  # kernel defined as a sliding window (faster method)
                centerF = (LF - np.mod(LF, 2)) // 2  # middle freq. bin
                centerT = (LT - np.mod(LT, 2)) // 2  # middle time frame
                centerTF = np.mat([centerF, centerT])  # middle time-freq. bin
                KWin = Ktemp.sim(centerTF, TFcoords)  # sliding kernel window
                KWin_reshape = np.reshape(KWin, (LT, LF)).T
                NZ = np.nonzero(KWin_reshape)  # range of element numbers of central nonzero elements
                KWin_shrink = KWin_reshape[NZ[0].min():NZ[0].max() + 1,
                              NZ[1].min():NZ[1].max() + 1]  # extract the central nonzero part
                ZMed = scipy.ndimage.filters.median_filter(np.reshape(zj, (LT, LF)).T,
                                                           footprint=KWin_shrink)  # median filter
                fj[:, :, ns] = np.reshape(ZMed.T, (LF * LT, 1))

            else:  # full kernel method (more general but slower approach)
                for ft in range(0, LF * LT):
                    simTemp = Ktemp.sim(TFcoords[ft, :], TFcoords)
                    NhoodTemp = np.nonzero(simTemp)
                    zjNhood = np.multiply(zj[NhoodTemp], simTemp[NhoodTemp])
                    fj[ft, :, ns] = np.median(np.array(zjNhood))

                    # print time.clock() - start_time, "seconds"


    # print time.clock() - start_time, "seconds"

    # Reshape the PSDs
    fhat = np.zeros((LF, LT, J))
    for ns in range(0, J):
        fhat[:, :, ns] = np.reshape(fj[:, 0, ns], (LT, LF)).T

    # Reshape the spectrograms
    Shat = 1j * np.zeros((LF, LT, I, J))  # estimated source STFTs
    for ns in range(0, J):
        for nch in range(0, I):
            Shat[:, :, nch, ns] = np.reshape(S[:, nch, ns], (LT, LF)).T


    # Compute the inverse stft of the estimated sources
    shat = np.zeros((x.shape[0], I, J))
    sigTemp = AudioSignal()
    sigTemp.windowtype = Mixture.windowtype
    sigTemp.windowlength = Mixture.windowlength
    sigTemp.overlap_samples = Mixture.overlap_samples
    sigTemp.numCh = I
    for ns in range(0, J):
        sigTemp.X = Shat[:, :, :, ns]
        shat[:, :, ns] = sigTemp.istft()[0][0:x.shape[0]]

    return shat, fhat
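A hedged usage sketch, assembled only from the docstring above; the kernel parameter values and file name are illustrative guesses, not verified against the Kernel class:

import numpy as np

# Two sources: one with horizontal (temporally smooth) structure,
# one with vertical (broadband/percussive) structure
source_kernels = [
    ['horizontal', np.mat([10])],  # illustrative kernel parameter
    ['vertical', np.mat([10])],
]

# Optional spectrogram parameters via the structured-array mechanism shown above
spec_params = np.zeros(1, dtype=[('windowlength', int)])
spec_params['windowlength'] = 1024

# Extract the first 30 seconds of a mixture on disk, run 3 backfitting iterations
shat, fhat = kam(['mixture.wav', 30, 0], source_kernels,
                 Numit=3, SpecParams=spec_params)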