def _make_audio_signal_from_stem(filename, stem, i, sr, label):
    """ Reads in a :param:`stem` from the stempeg library (`stempeg.read_stems()`) and creates a
    correctly formatted :class:`AudioSignal` object (with all the metadata set up).

    Args:
        filename (str): Name of the file on disc.
        stem (:obj:`np.ndarray`): Numpy array from the `stempeg.read_stems()` function.
        i (int): Index of the :param:`stem` array to get audio data from.
        sr (int): Sample rate.
        label (str): Label for the :class:`AudioSignal` object.

    Returns:
        (:obj:`AudioSignal`) Correctly formatted :class:`AudioSignal` object with the right metadata.
    """
    signal = AudioSignal(audio_data_array=stem[i, ...], sample_rate=sr)
    signal.path_to_input_file = filename
    signal.label = label
    return signal
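# A minimal sketch (not part of the library) of how `_make_audio_signal_from_stem` might be
# used with stempeg to unpack a .stem.mp4 file into labeled AudioSignal objects. The file
# path, the stem ordering, and the labels below are assumptions for illustration only.
#
#     import stempeg
#
#     stem_data, sample_rate = stempeg.read_stems('path/to/track.stem.mp4')
#     labels = ['mixture', 'drums', 'bass', 'other', 'vocals']  # assumed stem ordering
#     signals = [_make_audio_signal_from_stem('track.stem.mp4', stem_data, i, sample_rate, label)
#                for i, label in enumerate(labels)]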
class Repet(separation_base.SeparationBase):
    """Implements the original REpeating Pattern Extraction Technique (REPET) algorithm using the
    beat spectrum.

    REPET is a simple method for separating a repeating background from a non-repeating foreground
    in an audio mixture. It assumes a single repeating period over the whole signal duration, and
    finds that period based on finding a peak in the beat spectrum. The period can also be provided
    exactly, or you can give ``Repet`` a guess of the min and max period. Once it has a period, it
    "overlays" spectrogram sections of length ``period`` to create a median model (the background).

    References:
        * Zafar Rafii and Bryan Pardo. "Audio Separation System and Method," US20130064379 A1,
          US 13/612,413, March 14, 2013

    See Also:
        http://music.eecs.northwestern.edu/research.php?project=repet

    Parameters:
        input_audio_signal (:obj:`AudioSignal`): The ``AudioSignal`` object that REPET will be run
            on. This makes a copy of ``input_audio_signal``.
        min_period (float, optional): Minimum time to look for the repeating period, in seconds.
        max_period (float, optional): Maximum time to look for the repeating period, in seconds.
        period (float, optional): Exact time of the repeating period, in seconds.
        high_pass_cutoff (float, optional): Value (in Hz) for the high-pass cutoff filter.
        do_mono (bool, optional): Flattens the ``AudioSignal`` to mono before running the algorithm
            (does not affect the input ``AudioSignal`` object).
        use_find_period_complex (bool, optional): Will use a more complex peak picker to find the
            repeating period.
        use_librosa_stft (bool, optional): Calls librosa's stft function instead of nussl's.
        matlab_fidelity (bool, optional): If True, does REPET with the same settings as the original
            MATLAB implementation of REPET, warts and all. This will override ``use_librosa_stft``
            and set it to ``False``.

    Examples:
        :ref:`The REPET Demo Example <repet_demo>`

    Attributes:
        background (:obj:`AudioSignal`): Calculated background. This is ``None`` until ``run()``
            is called.
        foreground (:obj:`AudioSignal`): Calculated foreground. This is ``None`` until
            ``make_audio_signals()`` is called.
        beat_spectrum (:obj:`np.array`): Beat spectrum calculated by Repet.
""" def __init__(self, input_audio_signal, min_period=None, max_period=None, period=None, high_pass_cutoff=None, do_mono=False, use_find_period_complex=False, use_librosa_stft=config.USE_LIBROSA_STFT, matlab_fidelity=False): super(Repet, self).__init__(input_audio_signal=input_audio_signal) self.high_pass_cutoff = 100.0 if high_pass_cutoff is None else float(high_pass_cutoff) self.background = None self.foreground = None self.beat_spectrum = None self.use_find_period_complex = use_find_period_complex self.use_librosa_stft = use_librosa_stft self.repeating_period = None self.magnitude_spectrogram = None self.stft = None self.repeating_period = None self.matlab_fidelity = matlab_fidelity self._is_period_converted_to_hops = False if self.matlab_fidelity: self.use_librosa_stft = False # TODO: stereo doesn't do true stereo REPET (see TODO below) if do_mono: self.audio_signal.to_mono(overwrite=True) if (min_period or max_period) and period: raise ValueError('Cannot set both period and (min_period or max_period)!') # Set period parameters self.min_period, self.max_period, self.period = None, None, None if period is None: self.min_period = 0.8 if min_period is None else min_period self.max_period = min(8, self.audio_signal.signal_duration / 3) if max_period is None else max_period else: self.period = period if not self._is_period_converted_to_hops: self.period = self._update_period(self.period) self._is_period_converted_to_hops = True def run(self): """ Runs the original REPET algorithm Returns: background (AudioSignal): An AudioSignal object with repeating background in background.audio_data (to get the corresponding non-repeating foreground run self.make_audio_signals()) Example: :: signal = nussl.AudioSignal(path_to_input_file='input_name.wav') # Set up and run Repet repet = nussl.Repet(signal) repet.run() # or repet() # Get audio signals background, foreground = repet.make_audio_signals() # output the background background.write_ """ # High pass filter cutoff freq. (in # of freq. 
bins), +1 to match MATLAB implementation self.high_pass_cutoff = int(np.ceil(self.high_pass_cutoff * (self.stft_params.n_fft_bins - 1) / self.audio_signal.sample_rate)) + 1 # the MATLAB implementation had low = 1 if self.matlab_fidelity else 0 self._compute_spectrum() self.repeating_period = self._calculate_repeating_period() # separate the mixture background by masking background_stft = [] for i in range(self.audio_signal.num_channels): repeating_mask = self._compute_repeating_mask(self.magnitude_spectrogram[:, :, i]) repeating_mask[low:self.high_pass_cutoff, :] = 1 # high-pass filter the foreground # apply mask stft_with_mask = repeating_mask * self.stft[:, :, i] background_stft.append(stft_with_mask) background_stft = np.array(background_stft).transpose((1, 2, 0)) self.background = AudioSignal(stft=background_stft, sample_rate=self.audio_signal.sample_rate) self.background.istft(self.stft_params.window_length, self.stft_params.hop_length, self.stft_params.window_type, overwrite=True, use_librosa=self.use_librosa_stft) return self.background def _compute_spectrum(self): self.stft = self.audio_signal.stft(overwrite=True, remove_reflection=True, use_librosa=self.use_librosa_stft) self.magnitude_spectrogram = np.abs(self.stft) def get_beat_spectrum(self, recompute_stft=False): """Calculates and returns the beat spectrum for the audio signal associated with this object Args: recompute_stft: (Optional) (bool) Recompute the stft for the audio signal Returns: beat_spectrum (np.array): beat spectrum for the audio file EXAMPLE:: # Set up audio signal signal = nussl.AudioSignal('path_to_file.wav') # Set up a Repet object repet = nussl.Repet(signal) # I don't have to run repet to get a beat spectrum for signal beat_spec = repet.get_beat_spectrum() """ if recompute_stft or self.magnitude_spectrogram is None: self._compute_spectrum() # TODO: Make this multi-channel. The np.mean() reduces the n channels to 1. self.beat_spectrum = self.compute_beat_spectrum(np.mean(np.square(self.magnitude_spectrogram), axis=self.audio_signal._STFT_CHAN).T) return self.beat_spectrum def _calculate_repeating_period(self): # user provided a period, so no calculations to do if self.period is not None: return self.period # get beat spectrum self.beat_spectrum = self.get_beat_spectrum() if self.use_find_period_complex: self.repeating_period = self.find_repeating_period_complex(self.beat_spectrum) else: # update the min and max so they're in units of frequency bin indices if not self._is_period_converted_to_hops: self.min_period = self._update_period(self.min_period) self.max_period = self._update_period(self.max_period) self._is_period_converted_to_hops = True self.repeating_period = self.find_repeating_period_simple(self.beat_spectrum, self.min_period, self.max_period) return self.repeating_period @staticmethod def compute_beat_spectrum(power_spectrum): """Computes the beat spectrum averages (over freq's) the autocorrelation matrix of a one-sided spectrogram. The autocorrelation matrix is computed by taking the autocorrelation of each row of the spectrogram and dismissing the symmetric half. 
        Parameters:
            power_spectrum (np.array): 2D matrix containing the one-sided power spectrogram of the
                audio signal (Lf by Lt)
        Returns:
            (np.array): array containing the beat spectrum based on the power spectrogram
        """
        freq_bins, time_bins = power_spectrum.shape

        # row-wise autocorrelation according to the Wiener-Khinchin theorem
        power_spectrum = np.vstack([power_spectrum, np.zeros_like(power_spectrum)])
        fft_power_spec = scifft.fft(power_spectrum, axis=0)
        abs_fft = np.abs(fft_power_spec) ** 2
        autocorrelation_rows = np.real(scifft.ifft(abs_fft, axis=0)[:freq_bins, :])  # ifft over columns

        # normalization factor
        norm_factor = np.tile(np.arange(freq_bins, 0, -1), (time_bins, 1)).T
        autocorrelation_rows = autocorrelation_rows / norm_factor

        # compute the beat spectrum
        beat_spectrum = np.mean(autocorrelation_rows, axis=1)  # average over frequencies

        return beat_spectrum

    @staticmethod
    def find_repeating_period_simple(beat_spectrum, min_period, max_period):
        """Computes the repeating period of the sound signal using the beat spectrum.
        This algorithm just looks for the max value in the interval [min_period, max_period],
        inclusive. It discards the first value, and returns the period in units of stft time bins.

        Parameters:
            beat_spectrum (np.array): input beat spectrum array
            min_period (int): minimum possible period value
            max_period (int): maximum possible period value
        Returns:
            period (int): The period of the sound signal in stft time bins
        """
        min_period, max_period = int(min_period), int(max_period)
        beat_spectrum = beat_spectrum[1:]  # discard the first element of beat_spectrum (lag 0)
        beat_spectrum = beat_spectrum[min_period - 1: max_period]
        period = np.argmax(beat_spectrum) + min_period

        return period

    @staticmethod
    def find_repeating_period_complex(beat_spectrum):
        """Computes the repeating period of the sound signal using a more robust peak picker based
        on the auto-cosine similarity of the beat spectrum.

        Args:
            beat_spectrum (np.array): input beat spectrum array

        Returns:
            period (int): The period of the sound signal in stft time bins
        """
        auto_cosine = np.zeros((len(beat_spectrum), 1))

        for i in range(0, len(beat_spectrum) - 1):
            auto_cosine[i] = 1 - scipy.spatial.distance.cosine(beat_spectrum[0:len(beat_spectrum) - i],
                                                               beat_spectrum[i:len(beat_spectrum)])

        ac = auto_cosine[0:int(np.floor(auto_cosine.shape[0] / 2))]
        auto_cosine = np.vstack([ac[1], ac, ac[-2]])
        auto_cosine_diff = np.ediff1d(auto_cosine)
        sign_changes = auto_cosine_diff[0:-1] * auto_cosine_diff[1:]
        sign_changes = np.where(sign_changes < 0)[0]

        extrema_values = ac[sign_changes]

        e1 = np.insert(extrema_values, 0, extrema_values[0])
        e2 = np.insert(extrema_values, -1, extrema_values[-1])

        extrema_neighbors = np.stack((e1[0:-1], e2[1:]))

        m = np.amax(extrema_neighbors, axis=0)
        extrema_values = extrema_values.flatten()
        maxima = np.where(extrema_values >= m)[0]
        maxima = list(zip(sign_changes[maxima], extrema_values[maxima]))
        maxima = maxima[1:]
        maxima = sorted(maxima, key=lambda x: -x[1])
        period = maxima[0][0]

        return period

    def _compute_repeating_mask(self, magnitude_spectrogram_channel):
        """Computes the soft mask for the repeating part using the magnitude spectrogram and the
        repeating period

        Parameters:
            magnitude_spectrogram_channel (np.array): 2D matrix containing the magnitude
                spectrogram of a signal
        Returns:
            M (np.array): 2D matrix (Lf by Lt) containing the soft mask for the repeating part,
                elements of M take on values in [0, 1]
        """
        # this +1 is a kluge to make this implementation match the original MATLAB implementation
        period = self.repeating_period + 1
        freq_bins, time_bins = magnitude_spectrogram_channel.shape
        n_repetitions = int(np.ceil(float(time_bins) / period))
        one_period = freq_bins * period

        # Pad to make an integer number of repetitions. Pad with 'nan's to not affect the median.
        remainder = (period * n_repetitions) % time_bins
        mask_reshaped = np.hstack([magnitude_spectrogram_channel,
                                   float('nan') * np.zeros((freq_bins, remainder))])

        # reshape to take the median of each period
        mask_reshaped = np.reshape(mask_reshaped.T, (n_repetitions, one_period))

        # take median of repeating periods before and after the padding
        median_mask = np.nanmedian(mask_reshaped, axis=0)

        # reshape to its original shape
        median_mask = np.reshape(np.tile(median_mask, (n_repetitions, 1)),
                                 (n_repetitions * period, freq_bins)).T
        median_mask = median_mask[:, :time_bins]

        # take minimum of computed mask and original input and scale
        min_median_mask = np.minimum(median_mask, magnitude_spectrogram_channel)
        mask = (min_median_mask + constants.EPSILON) / (magnitude_spectrogram_channel + constants.EPSILON)

        return mask

    def update_periods(self):
        """ Will update periods for use with ``self.find_repeating_period_simple()``.
        Updates from seconds to stft time-bin values.
        Call this if you haven't done ``self.run()`` or else you won't get good results.

        Example:
            ::

            a = nussl.AudioSignal('path/to/file.wav')
            r = nussl.Repet(a)

            beat_spectrum = r.get_beat_spectrum()
            r.update_periods()
            repeating_period = r.find_repeating_period_simple(beat_spectrum, r.min_period, r.max_period)

        """
        if not self._is_period_converted_to_hops:
            self.period = self._update_period(self.period) if self.period is not None else None
            self.min_period = self._update_period(self.min_period) if self.min_period is not None else None
            self.max_period = self._update_period(self.max_period) if self.max_period is not None else None
            self._is_period_converted_to_hops = True

    def _update_period(self, period):
        period = float(period)
        result = period * self.audio_signal.sample_rate
        result += self.stft_params.window_length / self.stft_params.window_overlap - 1
        result /= self.stft_params.window_overlap
        return int(np.ceil(result))

    def plot(self, output_file, **kwargs):
        """ Creates a plot of the beat spectrum and outputs to output_file.

        Parameters:
            output_file (string): string representing a path to the desired output file to be created.
            title (string): Title to put on the plot
            show_repeating_period (bool): if True, then adds a vertical line where repet thinks
                the repeating period is (if the repeating period has been computed already)

        Example:
            ::

            signal = nussl.AudioSignal('Sample.wav')
            repet = nussl.Repet(signal)

            repet.plot('new_beat_spec_plot.png', title="Beat Spectrum of Sample.wav",
                       show_repeating_period=True)

        """
        import matplotlib.pyplot as plt
        plt.close('all')

        title = None
        show_repeating_period = False

        if len(kwargs) != 0:
            if 'title' in kwargs:
                title = kwargs['title']
            if 'show_repeating_period' in kwargs:
                show_repeating_period = kwargs['show_repeating_period']

        beat_spec = self.get_beat_spectrum()
        time_vect = np.linspace(0.0, self.audio_signal.signal_duration, num=len(beat_spec))
        plt.plot(time_vect, beat_spec)

        if self.repeating_period is not None and show_repeating_period:
            stft_vector = np.linspace(0.0, self.audio_signal.signal_duration, self.audio_signal.stft_length)
            rep = stft_vector[self.repeating_period]
            plt.plot((rep, rep), (0, np.max(beat_spec)), 'g--', label='Repeating period')
            # plt.plot((self.repeating_period, self.repeating_period), (-1e20, 1e20), 'g--')
            plt.ylim((0.0, np.max(beat_spec) * 1.1))

        title = title if title is not None else 'Beat Spectrum for {}'.format(self.audio_signal.file_name)
        plt.title(title)

        plt.xlabel('Time (s)')
        plt.ylabel('Beat Strength')
        plt.grid(True)
        plt.axis('tight')
        plt.savefig(output_file)

    def make_audio_signals(self):
        """ Returns the background and foreground audio signals. You must have run Repet.run()
        prior to calling this function. This function will return None if run() has not been called.

        Returns:
            Audio Signals (List): 2 element list.

                * bkgd: Audio signal with the calculated background track
                * fgnd: Audio signal with the calculated foreground track

        Example:
            ::

            # set up AudioSignal object
            signal = nussl.AudioSignal('path_to_file.wav')

            # set up and run repet
            repet = nussl.Repet(signal)
            repet.run()

            # get audio signals (AudioSignal objects)
            background, foreground = repet.make_audio_signals()

        """
        if self.background is None:
            return None

        self.foreground = self.audio_signal - self.background
        self.foreground.sample_rate = self.audio_signal.sample_rate
        return [self.background, self.foreground]
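# A minimal standalone sketch (not part of the class above) of the Wiener-Khinchin trick that
# `Repet.compute_beat_spectrum` relies on: zero-padding a signal to twice its length, taking
# |FFT|^2, and inverse-transforming yields the (unnormalized) autocorrelation at non-negative
# lags. Comparing against a direct np.correlate confirms the identity. The toy signal `x` is
# a stand-in for one column of the power spectrogram.
#
#     import numpy as np
#     from scipy import fftpack as scifft
#
#     x = np.random.rand(64)
#     padded = np.concatenate([x, np.zeros_like(x)])
#     autocorr_fft = np.real(scifft.ifft(np.abs(scifft.fft(padded)) ** 2))[:len(x)]
#     autocorr_direct = np.correlate(x, x, mode='full')[len(x) - 1:]
#     assert np.allclose(autocorr_fft, autocorr_direct)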
def mir1k(directory, check_hash=True, subset=None, shuffle=False, seed=None, undivided=False):
    """
    Generator function for the MIR-1K data set. This allows you to loop through the entire data set
    with only a few :class:`AudioSignal` objects stored in memory at a time. There are options for
    only looping through a subset of the data set and shuffling the data set (with a seed). See
    details about those options below.

    `nussl` calculates the hash of the MIR-1K directory and compares it against a precomputed hash
    for MIR-1K that ships with `nussl`. This hash is used to verify that `nussl` understands the
    directory structure when reading the files. Calculating the hash can be turned off if the user
    needs a speed up, but this might cause oblique errors if the MIR-1K directory is not set up in
    the same way as a fresh download of MIR-1K.

    MIR-1K also ships with two 'sets' of audio files: the divided and undivided sets. They contain
    the same content, the only difference is that the undivided set is one file per song, each song
    taking up the whole file, and the divided set has the same song divided into segments of ~3-12
    seconds. The :param:`undivided` parameter controls which of these two sets `nussl` will loop
    through.

    Examples:
        Using this generator function to loop through the MIR-1K data set. In this example, we use
        the generator directly in the ``for`` loop.

        .. code-block:: python
            :linenos:

            mir1k_path = '/path/to/MIR-1K'  # the MIR-1K directory on disc
            for mix, vox, acc in nussl.datasets.mir1k(mir1k_path):
                mix.to_mono(overwrite=True)  # sum to mono to make a 'mixture'

                # Get some basic metadata on the files.
                # (They'll all have the same file name, but different labels)
                print('Mixture       - Filename: {}, Label: {}'.format(mix.file_name, mix.label))
                print('Vocals        - Filename: {}, Label: {}'.format(vox.file_name, vox.label))
                print('Accompaniment - Filename: {}, Label: {}'.format(acc.file_name, acc.label))

                # Run an algorithm on the MIR-1K files and save to disc
                r = nussl.Repet(mix)
                r.run()

                bg_est, fg_est = r.make_audio_signals()
                bg_est.write_audio_to_file('{}_bg.wav'.format(os.path.splitext(mix.file_name)[0]))
                fg_est.write_audio_to_file('{}_fg.wav'.format(os.path.splitext(mix.file_name)[0]))

        It's also possible to use ``tqdm`` to print the progress to the console. This is useful
        because running through an entire data set can take a while. Here's a more advanced example
        using some other options as well:

        .. code-block:: python
            :linenos:

            import os
            import nussl
            from tqdm import tqdm

            mir1k_path = 'path/to/MIR-1K'  # the MIR-1K directory on disc
            idxs = range(29, 150)[::2]  # Only get every other song between [29, 150)
            mir1k_gen = nussl.datasets.mir1k(mir1k_path, subset=idxs,
                                             check_hash=False, undivided=True)

            # Tell tqdm the number of files we're running on so it can estimate a completion time
            for mix, vox, acc in tqdm(mir1k_gen, total=len(idxs)):

                mix.to_mono(overwrite=True)  # sum to mono to make a 'mixture'

                # Run an algorithm on the MIR-1K files and save to disc
                r = nussl.Repet(mix)
                r.run()

                bg_est, fg_est = r.make_audio_signals()
                bg_est.write_audio_to_file('{}_bg.wav'.format(os.path.splitext(mix.file_name)[0]))
                fg_est.write_audio_to_file('{}_fg.wav'.format(os.path.splitext(mix.file_name)[0]))

    Args:
        directory (str): Top-level directory for the MIR-1K data set.
        check_hash (bool, str): In the case that there is a mismatch between the expected and
            calculated hash, if this parameter is ``True`` (a bool) an exception is raised and if
            this parameter is ``'warn'`` (a string) a warning is printed to the console.
            If this parameter is ``False``, the hash will not be calculated for this directory,
            i.e., no hash check is performed.
        subset (float, list, str, None): This parameter determines how to make a subset of the
            audio files in the data set. There are four ways to use it, depending on what type
            this parameter takes:

            1) If :param:`subset` is a ``float``, then :param:`subset` will return the first
               ``X.Y%`` of audio files, where ``X.Y%`` is some arbitrary percentage. In this case,
               :param:`subset` is expected to be in the range [0.0, 1.0].
            2) If :param:`subset` is a ``list``, it is expected to be a list of indices (as
               ``int``s). This function will then produce the audio files in the list that
               correspond to those indices.
            3) If :param:`subset` is a ``str``, it will only include audio files with that string
               somewhere in the directory name.
            4) If :param:`subset` is ``None``, then the whole data set is traversed unobstructed.

            (See the sketch after this function for one way these four modes might be implemented.)
        shuffle (bool): Whether the data set should be shuffled.
        seed (int, 1-d array_like): Seed for ``numpy``'s random number generator used for shuffling.
        undivided (bool): Whether to use the divided (in the ``Wavfile`` directory) or undivided
            (in the ``UndividedWavfile`` directory) set of audio files.

    Yields:
        (``tuple(AudioSignal, AudioSignal, AudioSignal)``): A tuple of three :class:`AudioSignal`
        objects, with audio loaded for each source. In the tuple, they are returned in the
        following order: ``(mixture, vocals, accompaniment)``. In MIR-1K, the audio files are such
        that the vocals are hard panned to one channel and the accompaniment is hard panned to the
        other. So, the 'mixture' yielded here by this function reflects this, and needs to be mixed
        down to mono. In other words, ``mixture`` is a stereo :class:`AudioSignal` object, where
        each channel is one source, and similarly ``vocals`` and ``accompaniment`` are mono
        :class:`AudioSignal` objects made from a single channel in ``mixture``.

    """
    top_dir_name = 'MIR-1K'

    wavfile_hash = '33c085c1a7028199cd20317868849b413e0971022ebc4aefcf1bbc5516646c29'
    undivided_hash = '3f39af9be17515e042a7005b4c47110c6738623a7fada6233ba104535b7dde1b'

    if undivided:
        audio_dir_name = 'UndividedWavfile'
        mir1k_hash = undivided_hash
    else:
        audio_dir_name = 'Wavfile'
        mir1k_hash = wavfile_hash

    audio_extension = '.wav'
    all_wav_files = _data_set_setup(directory, top_dir_name, audio_dir_name,
                                    mir1k_hash, check_hash, audio_extension)

    all_wav_files = _subset_and_shuffle(all_wav_files, subset, shuffle, seed)

    for f in all_wav_files:
        mixture = AudioSignal(f)
        mixture.label = 'mixture'

        vocals = mixture.make_audio_signal_from_channel(1)
        vocals.label = 'vocals'

        accompaniment = mixture.make_audio_signal_from_channel(0)
        accompaniment.label = 'accompaniment'

        yield mixture, vocals, accompaniment
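# A minimal sketch (not the library's actual `_subset_and_shuffle`) of how the four `subset`
# modes documented above might be implemented. The name, the shuffle-before-subset ordering,
# and the exact semantics here are assumptions for illustration only.
def _subset_and_shuffle_sketch(file_list, subset, shuffle, seed):
    import numpy as np

    if shuffle:
        np.random.seed(seed)          # seeded shuffle so runs are reproducible
        np.random.shuffle(file_list)  # shuffles the list in place

    if subset is None:                # mode 4: traverse the whole data set
        return file_list
    if isinstance(subset, float):     # mode 1: first X.Y% of the files, subset in [0.0, 1.0]
        return file_list[:int(len(file_list) * subset)]
    if isinstance(subset, str):       # mode 3: substring match on the path
        return [f for f in file_list if subset in f]
    return [file_list[i] for i in subset]  # mode 2: explicit list of indices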
def kam(Inputfile, SourceKernels, Numit=1, SpecParams=np.array([]), FullKernel=False):
    """
    The 'kam' function implements the kernel backfitting algorithm to extract J audio sources
    from I channel mixtures.

    Inputs:
    Inputfile: (list) It can contain either:
               - Up to 3 elements: A string indicating the path of the .wav file containing
                 the I-channel audio mixture as the first element. The second (optional) element
                 indicates the length of the portion of the signal to be extracted in seconds
                 (default is the full length of the signal). The third (optional) element
                 indicates the starting point of the portion of the signal to be extracted
                 (default is 0 sec).
               OR
               - 2 elements: An I-column Numpy matrix containing samples of the time-domain
                 mixture as the first element and the sampling rate as the second element.

    SourceKernels: a list containing J sub-lists, each of which contains properties of one of the
               source kernels. Kernel properties are:
               - kernel type: (string) determines whether the kernel is one of the pre-defined
                 kernel types or a user-defined lambda function.
                 Choices are: 'cross', 'horizontal', 'vertical', 'periodic', 'userdef'
               - kparams (for pre-defined kernels): a Numpy matrix containing the numerical
                 values of the kernel parameters.
               - knhood (for user-defined kernels): logical lambda function which receives the
                 coordinates of two time-frequency bins and determines whether they are
                 neighbours (outputs TRUE if neighbour).
               - kwfunc (optional): lambda function which receives the coordinates of two
                 neighbouring time-frequency bins and computes the weight value at the second bin
                 given its distance from the first bin. The weight values fall in the interval
                 [0, 1]. Default: all ones over the neighbourhood (binary kernel).

    Numit: (optional) number of iterations of the backfitting algorithm - default: 1

    SpecParams: (optional) structured array containing spectrogram parameters, including:
               - windowtype (default is Hamming)
               - windowlength (default is 60 ms)
               - overlap_samples in [0, windowlength] (default is windowlength/2)
               - num_fft_bins (default is windowlength)
               - makeplot in {0, 1} (default is 0)
               - fmaxplot in Hz (default is fs/2)
               example:
               SpecParams = np.zeros(1, dtype=[('windowtype', '|S1'),
                                               ('windowlength', int),
                                               ('overlap_samples', int),
                                               ('num_fft_bins', int),
                                               ('makeplot', int),
                                               ('fmaxplot', float)])
               SpecParams['windowlength'] = 1024

    FullKernel: (optional) binary input which determines the method used for median filtering.
               If the kernel has a limited support and the same shape over all time-freq. bins,
               then instead of the full kernel method a sliding window can be used in the median
               filtering stage in order to make the algorithm run faster (linear computational
               complexity). The default value is False. A True value means implementing the case
               where the similarity measure is computed for all possible combinations of TF bins,
               resulting in quadratic computational complexity, and hence longer running time.

    Outputs:
    shat: a Ls by I by J Numpy array containing J time-domain source images on I channels
    fhat: a LF by LT by J Numpy array containing J power spectral densities
    """

    # Step (1):
    # Load the audio mixture from the input path
    if len(Inputfile) == 2:
        Mixture = AudioSignal(audiosig=Inputfile[0], fs=Inputfile[1])
    elif len(Inputfile) == 3:
        Mixture = AudioSignal(file_name=Inputfile[0], siglen=Inputfile[1], sigstart=Inputfile[2])

    if len(SpecParams) != 0:
        for i in range(0, len(SpecParams.dtype)):
            nameTemp = SpecParams.dtype.names[i]
            # copy each provided spectrogram parameter onto the Mixture object
            setattr(Mixture, nameTemp, SpecParams[0][i])
    x, tvec = np.array([Mixture.x, Mixture.time])  # time-domain channel mixtures
    X, Px, Fvec, Tvec = Mixture.do_STFT()  # stft and PSD of the channel mixtures

    I = Mixture.numCh  # number of channel mixtures
    J = len(SourceKernels)  # number of sources

    LF = np.size(Fvec)  # length of the frequency vector
    LT = np.size(Tvec)  # length of the time-frame vector

    F_ind = np.arange(LF)  # frequency bin indices
    T_ind = np.arange(LT)  # time frame indices
    Tmesh, Fmesh = np.meshgrid(T_ind, F_ind)  # grid of time-freq indices for the median filtering step
    TFcoords = np.mat(np.zeros((LF * LT, 2), dtype=int))  # all time-freq index combinations
    TFcoords[:, 0] = np.mat(np.asarray(Fmesh.T).reshape(-1)).T
    TFcoords[:, 1] = np.mat(np.asarray(Tmesh.T).reshape(-1)).T

    # Generate source kernels:
    Kj = []
    for ns in range(0, J):
        SKj = SourceKernels[ns]
        if len(SKj) < 2:
            raise Exception('The information required for generating source kernels is insufficient.'
                            ' Each sub-list in SourceKernels must contain at least two elements.')
        KTYPE = SKj[0]
        if KTYPE != 'userdef' and len(SKj) == 2:
            Kj.append(Kernel(Type=SKj[0], ParamVal=SKj[1]))
        elif KTYPE != 'userdef' and len(SKj) == 3:
            Kj.append(Kernel(Type=SKj[0], ParamVal=SKj[1], Wfunc=SKj[2]))
        elif KTYPE == 'userdef' and len(SKj) == 2:
            Kj.append(Kernel(Type=SKj[0], Nhood=SKj[1]))
        elif KTYPE == 'userdef' and len(SKj) == 3:
            Kj.append(Kernel(Type=SKj[0], Nhood=SKj[1], Wfunc=SKj[2]))

    # Step (2): initialization
    # Initialize the PSDs with the average mixture PSD and the spatial covariance matrices
    # with identity matrices
    X = np.reshape(X.T, (LF * LT, I))  # reshape the stft tensor into I vectors
    if I > 1:
        MeanPSD = np.mean(Px, axis=2) / (I * J)
    else:
        MeanPSD = Px / J

    MeanPSD = np.reshape(MeanPSD.T, (LF * LT, 1))  # reshape the mean PSD matrix into a vector

    fj = np.zeros((LF * LT, I * I, J))
    for j in range(0, J):
        fj[:, :, j] = np.tile(MeanPSD, (1, I * I))  # initialize by mean PSD

    Rj = 1j * np.zeros((1, I * I, J))
    for j in range(0, J):
        Rj[0, :, j] = np.reshape(np.eye(I), (1, I * I))
    Rj = np.tile(Rj, (LF * LT, 1, 1))

    ### Kernel Backfitting ###
    # start_time = time.clock()
    S = 1j * np.zeros((LF * LT, I, J))
    for n in range(0, Numit):
        # Step (3):
        # compute the inverse term: [sum_j' f_j' R_j']^-1
        SumFR = np.sum(fj * Rj, axis=2)  # NOTE: careful about memory storage here!
        SumFR.shape = (LF * LT, I, I)
        SumFR += 1e-16 * np.random.randn(LF * LT, I, I)  # to avoid singularity issues

        InvSumFR = np.zeros((LF * LT, I, I), dtype='single')
        if I == 1:
            InvSumFR = 1 / SumFR
        elif I == 2:
            InvDet = 1 / (SumFR[:, 0, 0] * SumFR[:, 1, 1] - SumFR[:, 0, 1] * SumFR[:, 1, 0])
            InvSumFR[:, 0, 0] = InvDet * SumFR[:, 1, 1]
            InvSumFR[:, 0, 1] = -InvDet * SumFR[:, 0, 1]
            InvSumFR[:, 1, 0] = -InvDet * SumFR[:, 1, 0]
            InvSumFR[:, 1, 1] = InvDet * SumFR[:, 0, 0]
        else:
            InvSumFR = np.linalg.inv(SumFR)
        InvSumFR.shape = (LF * LT, I * I)

        # compute sources, update PSDs and covariance matrices
        for ns in range(0, J):
            FRinvsum = fj[:, :, ns] * Rj[:, :, ns] * InvSumFR
            Stemp = 1j * np.zeros((LF * LT, I))
            for nch in range(0, I):
                FRtemp = FRinvsum[:, nch * I:(nch + 1) * I]
                Stemp[:, nch] = np.sum(FRtemp * X, axis=1)
            S[:, :, ns] = Stemp

            # Step (4-a):
            Cj = np.repeat(Stemp, I, axis=1) * np.tile(np.conj(Stemp), (1, I))

            # Step (4-b):
            Cj_reshape = np.reshape(Cj, (LF * LT, I, I))
            Cj_trace = np.mat(np.matrix.trace(Cj_reshape.T)).T
            MeanCj = Cj / np.tile(Cj_trace, (1, I * I))
            MeanCj_reshape = np.reshape(np.array(MeanCj), (LF, LT, I * I), order='F')
            Rj[:, :, ns] = np.tile(np.sum(MeanCj_reshape, axis=1), (LT, 1))

            # Step (4-c):
            # Note: the summation over 't' at step 4-c in the 2014 paper is a typo!
            # The correct formulation of zj is:
            #   zj = (1/I) * tr(inv(Rj(w)) Cj(w,t))
            Rj_reshape = np.reshape(Rj[:, :, ns], (LF * LT, I, I))
            Rj_reshape += 1e-16 * np.random.randn(LF * LT, I, I)

            InvRj = np.zeros((LF * LT, I, I), dtype='single')
            if I == 1:
                InvRj = 1 / Rj_reshape
            elif I == 2:
                InvDetR = 1 / (Rj_reshape[:, 0, 0] * Rj_reshape[:, 1, 1] -
                               Rj_reshape[:, 0, 1] * Rj_reshape[:, 1, 0])
                InvRj[:, 0, 0] = InvDetR * Rj_reshape[:, 1, 1]
                InvRj[:, 0, 1] = -InvDetR * Rj_reshape[:, 0, 1]
                InvRj[:, 1, 0] = -InvDetR * Rj_reshape[:, 1, 0]
                InvRj[:, 1, 1] = InvDetR * Rj_reshape[:, 0, 0]
            else:
                InvRj = np.linalg.inv(Rj_reshape)
            InvRj.shape = (LF * LT, I * I)

            InvRjCj = np.reshape(InvRj * Cj, (LF * LT, I, I))
            zj = np.real(np.matrix.trace(InvRjCj.T) / I)
            zj = np.mat(zj)

            # Step (4-d):
            # Median filter the estimated PSDs
            # start_time = time.clock()
            Ktemp = Kj[ns]  # Kernel corresponding to source #j

            if not FullKernel:  # kernel defined as a sliding window (faster method)
                centerF = (LF - np.mod(LF, 2)) // 2  # middle freq. bin
                centerT = (LT - np.mod(LT, 2)) // 2  # middle time frame
                centerTF = np.mat([centerF, centerT])  # middle time-freq. bin
                KWin = Ktemp.sim(centerTF, TFcoords)  # sliding kernel window
                KWin_reshape = np.reshape(KWin, (LT, LF)).T
                NZ = np.nonzero(KWin_reshape)  # range of element numbers of central nonzero elements
                KWin_shrink = KWin_reshape[NZ[0].min():NZ[0].max() + 1,
                                           NZ[1].min():NZ[1].max() + 1]  # extract the central nonzero part
                ZMed = scipy.ndimage.filters.median_filter(np.reshape(zj, (LT, LF)).T,
                                                           footprint=KWin_shrink)  # median filter
                fj[:, :, ns] = np.reshape(ZMed.T, (LF * LT, 1))
            else:  # full kernel method (more general but slower approach)
                for ft in range(0, LF * LT):
                    simTemp = Ktemp.sim(TFcoords[ft, :], TFcoords)
                    NhoodTemp = np.nonzero(simTemp)
                    zjNhood = np.multiply(zj[NhoodTemp], simTemp[NhoodTemp])
                    fj[ft, :, ns] = np.median(np.array(zjNhood))
            # print time.clock() - start_time, "seconds"

    # Reshape the PSDs
    fhat = np.zeros((LF, LT, J))
    for ns in range(0, J):
        fhat[:, :, ns] = np.reshape(fj[:, 0, ns], (LT, LF)).T

    # Reshape the spectrograms
    Shat = 1j * np.zeros((LF, LT, I, J))  # estimated source STFTs
    for ns in range(0, J):
        for nch in range(0, I):
            Shat[:, :, nch, ns] = np.reshape(S[:, nch, ns], (LT, LF)).T

    # Compute the inverse stft of the estimated sources
    shat = np.zeros((x.shape[0], I, J))
    sigTemp = AudioSignal()
    sigTemp.windowtype = Mixture.windowtype
    sigTemp.windowlength = Mixture.windowlength
    sigTemp.overlap_samples = Mixture.overlap_samples
    sigTemp.numCh = I
    for ns in range(0, J):
        sigTemp.X = Shat[:, :, :, ns]
        shat[:, :, ns] = sigTemp.istft()[0][0:x.shape[0]]

    return shat, fhat
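# A minimal standalone check (not part of the algorithm above) of the batched closed-form 2x2
# inverse used in steps (3) and (4-c): for a stack of 2x2 matrices, the cofactor formula
# inv(A) = (1/det(A)) * [[d, -b], [-c, a]] should agree with np.linalg.inv applied batch-wise.
# The function name and the test setup are illustrative only.
def _check_batched_2x2_inverse(n_matrices=1000):
    import numpy as np

    # random stack of 2x2 matrices; the tiny offset mirrors the singularity guard used above
    A = np.random.randn(n_matrices, 2, 2) + 1e-16
    det = A[:, 0, 0] * A[:, 1, 1] - A[:, 0, 1] * A[:, 1, 0]

    inv = np.empty_like(A)
    inv[:, 0, 0] = A[:, 1, 1] / det
    inv[:, 0, 1] = -A[:, 0, 1] / det
    inv[:, 1, 0] = -A[:, 1, 0] / det
    inv[:, 1, 1] = A[:, 0, 0] / det

    # np.linalg.inv broadcasts over the leading axis, which is what the I > 2 branch relies on
    assert np.allclose(inv, np.linalg.inv(A))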