Example #1
    def _build_mwf_output_waveform(self):
        """ Perform separation with multichannel Wiener Filtering using Norbert.
        Note: multichannel Wiener Filtering is not coded in Tensorflow and thus
        may be quite slow.

        :returns: dictionary of separated waveforms (key: instrument name,
            value: estimated waveform of the instrument)
        """
        import norbert  # pylint: disable=import-error
        output_dict = self.model_outputs
        x = self.stft_feature
        v = tf.stack([
            pad_and_reshape(output_dict[f'{instrument}_spectrogram'],
                            self._frame_length, self._F)[:tf.shape(x)[0], ...]
            for instrument in self._instruments
        ],
                     axis=3)
        input_args = [v, x]
        stft_function = tf.py_function(
            lambda v, x: norbert.wiener(v.numpy(), x.numpy()), input_args,
            tf.complex64)
        return {
            instrument: self._inverse_stft(stft_function[:, :, :, k])
            for k, instrument in enumerate(self._instruments)
        }
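The wrapper above exists only because norbert operates on plain numpy arrays rather than TensorFlow tensors. As a minimal, purely illustrative sketch of the underlying norbert.wiener call (random data; the shapes follow the conventions noted in Example #13):

import numpy as np
import norbert

nb_frames, nb_bins, nb_channels, nb_sources = 100, 1025, 2, 4

# v: non-negative source spectrogram estimates, one source along the last axis
v = np.random.rand(nb_frames, nb_bins, nb_channels, nb_sources)
# x: complex STFT of the mixture
x = (np.random.randn(nb_frames, nb_bins, nb_channels)
     + 1j * np.random.randn(nb_frames, nb_bins, nb_channels))

# y stacks one complex source STFT per entry of the last axis
y = norbert.wiener(v, x)
assert y.shape == x.shape + (nb_sources,)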
Example #2
def separate(fileIn, fileOut, modelname):
    audio, samplerate = librosa.load(fileIn, sr=22050)
    snips = snipify(audio)
    specs, stfts = SPECify(snips)

    model = autoencoder.loadModel()
    model.load_weights(modelname)
    sourceSpecs = model.predict(specs)
    sourceaudio = np.array([])

    for i in range(0, sourceSpecs.shape[0]):
        sourceSpec = sourceSpecs[i].T
        stft = stfts[i]
        stft = np.expand_dims(stft, axis=2)
        sourceSpec = np.expand_dims(sourceSpec, axis=2)
        sourceSpec = np.expand_dims(sourceSpec, axis=3)
        print(sourceSpec.shape, stft.shape)
        resi = norbert.residual_model(sourceSpec, stft.astype(np.complex128),
                                      1)
        sourceSpecNorbert = norbert.wiener(resi,
                                           stft.astype(np.complex128),
                                           1,
                                           use_softmask=False)
        sourceSpecNorbert1 = sourceSpecNorbert[:, ..., 0, 0]
        sourceaudio = np.append(sourceaudio, librosa.istft(sourceSpecNorbert1))

    soundfile.write(fileOut, sourceaudio, samplerate)
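Example #2 appends a "residual" source with norbert.residual_model before filtering. A small numpy sketch of that step, with random data and shapes chosen to mirror the mono, single-estimate case above (illustrative only, not taken from the source):

import numpy as np
import norbert

nb_frames, nb_bins = 200, 1025
# complex mixture STFT with a single channel
x = (np.random.randn(nb_frames, nb_bins, 1)
     + 1j * np.random.randn(nb_frames, nb_bins, 1))
# a single estimated source spectrogram
v = np.random.rand(nb_frames, nb_bins, 1, 1)

# residual_model appends a residual source along the last axis
v_with_residual = norbert.residual_model(v, x)
print(v_with_residual.shape)  # expected (200, 1025, 1, 2)

# same call pattern as Example #2: one EM iteration, no softmask
y = norbert.wiener(v_with_residual, x, 1, use_softmask=False)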
Example #3
def test_shapes(X, V):
    Y = norbert.wiener(V, X)

    assert X.shape == Y.shape[:-1]

    Y = norbert.softmask(V, X)

    assert X.shape == Y.shape[:-1]
Example #4
def test_wiener_copy(X, V):
    X0 = np.copy(X)
    V0 = np.copy(V)

    _ = norbert.wiener(V, X)

    assert np.allclose(X0, X)
    assert np.allclose(V0, V)
Example #5
def test_silent_sources(X, V):
    V[..., :] = 0.0
    Y = norbert.softmask(V, X)

    assert X.shape == Y.shape[:-1]

    Y = norbert.wiener(V, X)
    assert X.shape == Y.shape[:-1]
Example #6
def test_wiener_copy(X, V):
    X0 = X.clone()
    V0 = V.clone()

    _ = norbert.wiener(V, X)

    assert torch.allclose(X0, X)
    assert torch.allclose(V0, V)
Example #7
def test_shapes(V, X):
    Y = norbert.residual(V, X)
    assert X.shape == Y.shape[:-1]

    Y = norbert.wiener(V, X)
    assert X.shape == Y.shape[:-1]

    Y = norbert.softmask(V, X)
    assert X.shape == Y.shape[:-1]
Example #8
def separate_from_audio(audio,
                        rate,
                        mask_model,
                        wiener_filter=True,
                        return_spectrogram=False):
    split_stft, full_stft = preprocess_audio_tf(np.expand_dims(audio, axis=0),
                                                test=True)
    mask = mask_model.predict(split_stft)
    #objective = preprocess(np.array(np.hstack(objective_vocal_samples)),X_mean,X_std)
    mask_in_shape = np.concatenate(mask, axis=1)[:, :, 0]
    input_in_shape = full_stft
    json_path = '../norm_data_full.json'
    with open(json_path) as infile:
        norm_data = json.load(infile)
    X_mean = norm_data['X_min']
    X_std = norm_data['X_max'] - norm_data['X_min']
    test_sample = np.zeros((513, input_in_shape.shape[1]), dtype=complex)
    test_sample[0:513] = full_stft[0:513]
    mask_final = np.zeros((513, test_sample.shape[1]))
    final_mag = np.zeros((513, test_sample.shape[1]))

    mask_final[0:512] = np.concatenate(mask, axis=1)[:, :, 0]
    pre_result = preprocess(test_sample, X_mean, X_std)

    final_mag = denormalize(mask_final, X_mean, X_std)
    result_stft = np.multiply(np.exp(final_mag),
                              np.exp(1j * np.angle(test_sample)))

    audio_vocal_pred = tf.signal.inverse_stft(
        result_stft.T,
        frame_length=1024,
        frame_step=512,
        fft_length=1024,
        window_fn=tf.signal.inverse_stft_window_fn(512)).numpy()
    if wiener_filter:
        test_sample_T = test_sample.T[:, :, np.newaxis]
        result_stft_T = result_stft.T[:, :, np.newaxis, np.newaxis]

        v = norbert.contrib.residual_model(np.abs(result_stft_T),
                                           test_sample_T)
        result_wiener = norbert.wiener(v, test_sample_T, iterations=2)[:, :, :,
                                                                       0]
        result_stft = result_wiener.T.reshape(final_mag.shape[0],
                                              final_mag.shape[1])
        audio_vocal_pred = tf.signal.inverse_stft(
            result_stft.T,
            frame_length=1024,
            frame_step=512,
            fft_length=1024,
            window_fn=tf.signal.inverse_stft_window_fn(512)).numpy()

    if return_spectrogram:
        return result_stft, mask_in_shape, preprocess_tf(
            input_in_shape, X_mean, X_std)
    return audio_vocal_pred[:len(audio)]
Example #9
def PostProcess(Y, stft):
    stft = np.expand_dims(stft, axis=2)
    Y = np.expand_dims(Y.T, axis=3)
    resi = norbert.residual_model(Y, stft.astype(np.complex128), 1)
    YNorbert = norbert.wiener(resi,
                              stft.astype(np.complex128),
                              1,
                              use_softmask=False)
    YNorbert1 = YNorbert[:, ..., 0, 0]
    Yaudio = librosa.istft(YNorbert1)
    return Yaudio
Example #10
def separate(audio,
             targets,
             model_name='umxhq',
             niter=1,
             softmask=False,
             alpha=1,
             residual_model=False,
             device='cpu'):

    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []

    for j, target in enumerate(tqdm.tqdm(targets)):
        unmix_target = load_model(target=target,
                                  model_name=model_name,
                                  device=device)
        Vj = unmix_target(audio_torch).cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, ...])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    X = unmix_target.stft(audio_torch).detach().cpu().numpy()
    # convert to complex numpy type
    X = X[..., 0] + X[..., 1] * 1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(targets) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual']
                         if len(targets) > 1 else ['accompaniment'])

    Y = norbert.wiener(V,
                       X.astype(np.complex128),
                       niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(Y[..., j].T,
                          n_fft=unmix_target.stft.n_fft,
                          n_hopsize=unmix_target.stft.n_hop)
        estimates[name] = audio_hat.T

    return estimates
Example #11
def invoke_fast_norbert(filename: str):
    testcase0 = np.load(filename)
    x1, v1 = testcase0['x'], testcase0['v']
    x2, v2 = np.copy(x1), np.copy(v1)

    niter = 1
    use_softmask = False

    y1 = fast_norbert.wiener(v1, x1, niter, use_softmask=use_softmask)
    y2 = norbert.wiener(v2, x2, niter, use_softmask=use_softmask)

    assert y1.shape == y2.shape, f'{y1.shape} == {y2.shape}'
    assert np.allclose(y1, y2), f'{y1.flatten()} == {y2.flatten()}'
Example #12
    def _build_mwf_output_waveform(self, output_dict):

        import norbert
        x = self._features[f'{self._mix_name}_stft']
        v = tf.stack(
            [
                pad_and_reshape(
                    output_dict[f'{instrument}_spectrogram'],
                    self._frame_length,
                    self._F)[:tf.shape(x)[0], ...]
                for instrument in self._instruments
            ],
            axis=3)
        input_args = [v, x]
        stft_function = tf.py_function(
            lambda v, x: norbert.wiener(v.numpy(), x.numpy()),
            input_args,
            tf.complex64)
        return {
            instrument: self._inverse_stft(stft_function[:, :, :, k])
            for k, instrument in enumerate(self._instruments)
        }
Example #13
def preprocess_with_norbert(complex_stft_mix, predicted_magnitudes):
    # v: np.ndarray [shape=(nb_frames, nb_bins, {1,nb_channels}, nb_sources)]
    # x: np.ndarray [complex, shape=(nb_frames, nb_bins, nb_channels)]

    complex_stft_mix = complex_stft_mix.detach().data.cpu()
    complex_stft_mix_numpy = np.array(
        complex_stft_mix[:, :, :, :,
                         0]) + 1j * np.array(complex_stft_mix[:, :, :, :, 1])
    complex_stft_mix_numpy = complex_stft_mix_numpy.transpose([0, 3, 2, 1])

    predicted_magnitudes = predicted_magnitudes.detach().data.cpu()
    predicted_magnitudes = np.array(predicted_magnitudes).transpose(3, 2, 0, 1)
    predicted_complex_stft = norbert.wiener(predicted_magnitudes,
                                            complex_stft_mix_numpy[0])
    real, imag = np.real(predicted_complex_stft), np.imag(
        predicted_complex_stft)
    real = real.transpose(2, 3, 1, 0)
    imag = imag.transpose(2, 3, 1, 0)

    torch_predicted_stft = torch.stack(
        (torch.tensor(real), torch.tensor(imag)), dim=4).float()
    return torch_predicted_stft
Example #14
    def run(self):
        source_magnitudes = np.stack(
            [np.abs(e.stft()) for e in self.estimates], axis=-1)
        source_magnitudes = np.transpose(source_magnitudes, (1, 0, 2, 3))
        mix_stft = np.transpose(self.audio_signal.stft(), (1, 0, 2))

        enhanced = norbert.wiener(source_magnitudes,
                                  mix_stft,
                                  iterations=self.iterations,
                                  **self.kwargs)
        _masks = np.abs(enhanced) / np.maximum(1e-7, np.abs(mix_stft[...,
                                                                     None]))
        _masks = np.transpose(_masks, (1, 0, 2, 3))

        self.result_masks = []

        for i in range(_masks.shape[-1]):
            mask_data = _masks[..., i]
            if self.mask_type == self.MASKS['binary']:
                mask_data = _masks[..., i] == np.max(_masks, axis=-1)
            mask = self.mask_type(mask_data)
            self.result_masks.append(mask)

        return self.result_masks
Example #15
def separate(
    audio,
    targets,
    model_name='umxhq',
    niter=1, softmask=False, alpha=1.0,
    residual_model=False, device='cpu'
):
    """
    Performing the separation on audio input

    Parameters
    ----------
    audio: np.ndarray [shape=(nb_samples, nb_channels, nb_timesteps)]
        mixture audio

    targets: list of str
        a list of the separation targets.
        Note that for each target a separate model is expected
        to be loaded.

    model_name: str
        name of torchhub model or path to model folder, defaults to `umxhq`

    niter: int
         Number of EM steps for refining initial estimates in a
         post-processing stage, defaults to 1.

    softmask: boolean
        if activated, then the initial estimates for the sources will
        be obtained through a ratio mask of the mixture STFT, and not
        by using the default behavior of reconstructing waveforms
        by using the mixture phase, defaults to False

    alpha: float
        changes the exponent to use for building ratio masks, defaults to 1.0

    residual_model: boolean
        computes a residual target, for custom separation scenarios
        when not all targets are available, defaults to False

    device: str
        set torch device. Defaults to `cpu`.

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary of all estimates as performed by the separation model.

    """
    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []

    for j, target in enumerate(tqdm.tqdm(targets)):
        unmix_target = load_model(
            target=target,
            model_name=model_name,
            device=device
        )
        Vj = unmix_target(audio_torch).cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, ...])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    X = unmix_target.stft(audio_torch).detach().cpu().numpy()
    # convert to complex numpy type
    X = X[..., 0] + X[..., 1]*1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(targets) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual'] if len(targets) > 1
                         else ['accompaniment'])

    Y = norbert.wiener(V, X.astype(np.complex128), niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(
            Y[..., j].T,
            n_fft=unmix_target.stft.n_fft,
            n_hopsize=unmix_target.stft.n_hop
        )
        estimates[name] = audio_hat.T

    return estimates
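As a usage reminder for the separate() function above, a minimal sketch modeled on Example #24 (the file name and target list are hypothetical, and a stereo 44.1 kHz input is assumed, so no resampling or channel handling is shown):

import soundfile as sf

audio, rate = sf.read('mixture.wav', always_2d=True)  # (nb_timesteps, nb_channels)
estimates = separate(audio,
                     targets=['vocals', 'drums', 'bass', 'other'],
                     model_name='umxhq',
                     niter=1,
                     device='cpu')
for name, waveform in estimates.items():
    sf.write(f'{name}.wav', waveform, rate)  # one file per separated source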
Example #16
def test_wiener(V, X):
    X = (X.shape[-1] * np.ones(X.shape)).astype(np.complex128)
    Y = norbert.wiener(V, X)
    assert np.allclose(Y.sum(-1), X)
Example #17
def predict(dnn_model, device, data, sr, trained_on="vocals"):
    """
    Predicts the estimates of vocals and accompaniment using the model provided.

    Parameters
    ----------
    dnn_model : Generalised_Recurrent_Model
        model to use for prediction
    device : torch.device
        device to use
    data : ndarray(nb_samples, nb_channels)
        data of mixture track in time series
    sr : int
        sampling rate of the mixture track
    trained_on : str
        Labels of the trained model "vocals" or "accompaniment"

    Returns
    -------
    acc_estimate: ndarray, shape(nb_samples, nb_channels)
        Accompaniment estimates in time series
    vocals_estimate: ndarray, shape(nb_samples, nb_channels)
        Vocals estimates in time series
    """
    # transformation object
    transform = STFT(sr=DATASET_CONFIG.SR,
                     n_per_seg=DATASET_CONFIG.N_PER_SEG,
                     n_overlap=DATASET_CONFIG.N_OVERLAP)

    # Scaler object
    scaler = Scaler()

    # convert track to mono track
    if data.shape[1] != 1:
        data = sp.to_mono(data)

    nb_samples, nb_channels = data.shape

    # generate STFT of time series data, shape(nbframes, nb_bins, nb_channels)
    mixture_tf = transform.stft(data.T)

    # get spectrogram of STFT i.e., |Xi|, shape(nbframes, nb_bins, nb_channels)
    mixture_stft = np.abs(mixture_tf)

    # scaling the values to 0 to 1, shape(nbframes, nb_bins, nb_channels)
    X_scaled = scaler.scale(mixture_stft)

    # transposing the matrix to make it in shape (nb_batch, nb_frames, nb_bins)
    X_scaled = np.transpose(X_scaled, (2, 0, 1))

    mixture_tensor = torch.tensor(X_scaled, dtype=torch.float32,
                                  device=device).to(device)
    estimate = dnn_model(mixture_tensor)

    # output tensor shape (nb_batch, nb_frames, nb_bins)
    estimate_np = estimate[0].cpu().detach().numpy()

    # stacking the output to make it in stereo shape
    # and transposing it back to shape (nb_frames, nb_bins, nb_channels)
    estimate_stereo = np.stack([estimate_np, estimate_np]).transpose(1, 2, 0)
    # intensifies the signal
    estimate_stereo = estimate_stereo[..., None]**2

    # stacking the mixture stft to make it in stereo shape
    # and transposing it back to shape (nb_frames, nb_bins, nb_channels)
    mixture_tf_squeeze = np.squeeze(mixture_tf)
    mixture_tf_stereo = np.stack([mixture_tf_squeeze,
                                  mixture_tf_squeeze]).transpose(1, 2, 0)

    # models the estimates to stft, frequency wise.
    estimate_residual = norbert.residual(estimate_stereo, mixture_tf_stereo)
    # applying wiener filers to get the sources
    estimate_filter_results = norbert.wiener(np.copy(estimate_residual),
                                             np.copy(mixture_tf_stereo))

    # return the estimates based on the source type of the labels
    if trained_on == "vocals":
        vocals_estimate = transform.istft(estimate_filter_results[..., 0]).T
        acc_estimate = transform.istft(estimate_filter_results[..., 1]).T
        return acc_estimate, vocals_estimate
    else:
        acc_estimate = transform.istft(estimate_filter_results[..., 0]).T
        vocals_estimate = transform.istft(estimate_filter_results[..., 1]).T
        return acc_estimate, vocals_estimate
Example #18
def separate(
    audio,
    x_umx_target,
    instruments,
    niter=1,
    softmask=False,
    alpha=1.0,
    residual_model=False,
    device="cpu",
):
    """
    Performing the separation on audio input

    Parameters
    ----------
    audio: np.ndarray [shape=(nb_samples, nb_channels, nb_timesteps)]
        mixture audio

    x_umx_target: asteroid.models
        X-UMX model used for separating

    instruments: list
        The list of instruments, e.g., ["bass", "drums", "vocals"]

    niter: int
         Number of EM steps for refining initial estimates in a
         post-processing stage, defaults to 1.

    softmask: boolean
        if activated, then the initial estimates for the sources will
        be obtained through a ratio mask of the mixture STFT, and not
        by using the default behavior of reconstructing waveforms
        by using the mixture phase, defaults to False

    alpha: float
        changes the exponent to use for building ratio masks, defaults to 1.0

    residual_model: boolean
        computes a residual target, for custom separation scenarios
        when not all targets are available, defaults to False

    device: str
        set torch device. Defaults to `cpu`.

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary with all estimates obtained by the separation model.
    """

    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []

    masked_tf_rep, _ = x_umx_target(audio_torch)
    # shape: (Sources, frames, batch, channels, fbin)

    for j, target in enumerate(instruments):
        Vj = masked_tf_rep[j, Ellipsis].cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, Ellipsis])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    # convert to complex numpy type
    tmp = x_umx_target.encoder(audio_torch)
    X = torch_complex_from_magphase(tmp[0].permute(1, 2, 3, 0), tmp[1])
    X = X.detach().cpu().numpy()
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(instruments) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += ["residual"
                         ] if len(instruments) > 1 else ["accompaniment"]

    Y = norbert.wiener(V,
                       X.astype(np.complex128),
                       niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(
            Y[..., j].T,
            rate=x_umx_target.sample_rate,
            n_fft=x_umx_target.in_chan,
            n_hopsize=x_umx_target.n_hop,
        )
        estimates[name] = audio_hat.T

    return estimates
Example #19
def test_wiener(V, X, nb_iterations):
    X = X.shape[-1] * np.ones(X.shape)
    Y = norbert.wiener(V, X, iterations=nb_iterations)
    assert np.allclose(Y.sum(-1), X)
Example #20
V = []   # holds the source spectrograms passed to norbert
Vj = []  # holds the vocals' spectrograms
for i in tqdm.tqdm(range(len(X)), desc='Estimating vocals..'):
    Vj.append(model(X[i]))
Vj = torch.cat(Vj, dim=3).cpu().detach().numpy()

# Prepare input for MWF.
print('Calculating MWF..')
V_vox = np.transpose(Vj, [3, 0, 1, 2])
V.append(V_vox[:, 0, ...])  # remove sample dim
V = np.transpose(np.array(V), (1, 3, 2, 0))

X = model.mdensenet.stft(audio).detach().cpu().numpy()
X = X[..., 0] + X[..., 1] * 1j
X = X[0].transpose(2, 1, 0)
V = norbert.residual_model(V, X, 1)
Y = norbert.wiener(V, X.astype(np.complex128), 1, use_softmask=False)

# Extract source estimates in time domain.
s = []
estimates = {}
for j in range(Y.shape[-1]):
    audio_hat = istft(Y[..., j].T, n_fft=n_fft, n_hop=n_hop, sr=sr)
    s.append(audio_hat.T)

end_time = time.time()
print(f'Separation duration: {end_time - start_time:.2f} sec.')

print('Saving track..')
out_name = Path(args.out_name).expanduser()
out_name.parent.mkdir(parents=True, exist_ok=True)
Example #21
def separate(audio, args):
    """
    Performing the separation on audio input
    Parameters
    ----------
    audio: np.ndarray [shape=(nb_timesteps, nb_channels)]
        mixture audio
    args : ArgumentParser
        ArgumentParser for OpenUnmix_CrossNet(X-UMX)/OpenUnmix(UMX) Inference

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary of all estimates as performed by the separation model.
    """

    # convert numpy audio to NNabla Variable
    audio_nn = nn.Variable.from_numpy_array(audio.T[None, ...])
    source_names = []
    V = []
    max_bin = bandwidth_to_max_bin(sample_rate=44100,
                                   n_fft=4096,
                                   bandwidth=16000)

    if not args.umx_infer:
        # Run X-UMX Inference
        nn.load_parameters(args.model)
        for j, target in enumerate(args.targets):
            if j == 0:
                unmix_target = model.OpenUnmix_CrossNet(max_bin=max_bin,
                                                        is_predict=True)
                mix_spec, msk, _ = unmix_target(audio_nn, test=True)
                # Network output is (nb_frames, nb_samples, nb_channels, nb_bins)
            V.append((msk[Ellipsis, j * 2:j * 2 + 2, :] * mix_spec).d[:, 0,
                                                                      ...])
            source_names += [target]
    else:
        # Run UMX Inference
        for j, target in enumerate(args.targets):
            with nn.parameter_scope(target):
                unmix_target = model.OpenUnmix(max_bin=max_bin)
                nn.load_parameters(f"{os.path.join(args.model, target)}.h5")
                # Network output is (nb_frames, nb_samples, nb_channels, nb_bins)
                V.append(unmix_target(audio_nn, test=True).d[:, 0, ...])
            source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))
    if args.softmask:
        # only exponentiate the model if we use softmask
        V = V**args.alpha

    real, imag = model.get_stft(audio_nn, center=True)

    # convert to complex numpy type
    X = real.d + imag.d * 1j
    X = X[0].transpose(2, 1, 0)

    if args.residual_model or len(args.targets) == 1:
        V = norbert.residual_model(V, X, args.alpha if args.softmask else 1)
        source_names += (['residual']
                         if len(args.targets) > 1 else ['accompaniment'])

    Y = norbert.wiener(V,
                       X.astype(np.complex128),
                       args.niter,
                       use_softmask=args.softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(Y[..., j].T,
                          n_fft=unmix_target.n_fft,
                          n_hopsize=unmix_target.n_hop)
        estimates[name] = audio_hat.T

    return estimates
Example #22
def separate(audio,
             targets,
             model_name='umxhq',
             niter=1,
             softmask=False,
             alpha=1.0,
             residual_model=False,
             device='cpu'):
    """
    Performing the separation on audio input

    Parameters
    ----------
    audio: np.ndarray [shape=(nb_samples, nb_channels, nb_timesteps)]
        mixture audio

    targets: list of str
        a list of the separation targets.
        Note that for each target a separate model is expected
        to be loaded.

    model_name: str
        name of torchhub model or path to model folder, defaults to `umxhq`

    niter: int
         Number of EM steps for refining initial estimates in a
         post-processing stage, defaults to 1.

    softmask: boolean
        if activated, then the initial estimates for the sources will
        be obtained through a ratio mask of the mixture STFT, and not
        by using the default behavior of reconstructing waveforms
        by using the mixture phase, defaults to False

    alpha: float
        changes the exponent to use for building ratio masks, defaults to 1.0

    residual_model: boolean
        computes a residual target, for custom separation scenarios
        when not all targets are available, defaults to False

    device: str
        set torch device. Defaults to `cpu`.

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary of all estimates as performed by the separation model.

    """
    # convert numpy audio to torch
    print('loading audio')
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)
    print('audio loaded')
    source_names = targets
    unmix = load_model(targets=targets, model_name=model_name, device=device)
    print('model loaded')
    # Obtain the mask from the model
    V = unmix(audio_torch)
    print('separation obtained')
    X = unmix.stft(audio_torch).permute(3, 0, 1, 2, 4)
    # Apply the mask
    mag = torchaudio.functional.complex_norm(X)
    V = [Y_hat * mag for Y_hat in V]
    # From torch to numpy complex, for norbert EM algorithm
    V = np.array([m.cpu().detach().numpy() for m in V])[:, :, 0, :, :]
    V = V.transpose(1, 3, 2, 0)
    X = X.detach().cpu().numpy()[:, 0, :, :]
    X = X[..., 0] + X[..., 1] * 1j
    X = X.transpose(0, 2, 1)
    print('pre-norbert OK')
    # Apply norbert Wiener Filter
    Y_EM = norbert.wiener(V,
                          X.astype(np.complex128),
                          niter,
                          use_softmask=softmask)
    print('norbert OK')
    # back to torch complex for torchaudio ISTFT:
    Y_hats = torch.stack(
        [torch.from_numpy(np.real(Y_EM)),
         torch.from_numpy(np.imag(Y_EM))]).permute(1, 4, 3, 2, 0)
    Y_hats = Y_hats.float().unsqueeze(2).unbind(1)
    y_hats = [unmix.istft(spec, audio_torch.shape[-1]) for spec in Y_hats]
    # back to numpy for BSSeval
    y_hats = [y_hat.cpu().detach().numpy() for y_hat in y_hats]
    print('numpy OK')
    estimates = {}
    for j, name in enumerate(source_names):
        estimates[name] = y_hats[j][
            0].T  #final estimate should be [length,2] and float64
    return estimates
Example #23
def separate(
    audio,
    model_path='models/x-umx.h5',
    niter=1,
    softmask=False,
    alpha=1.0,
    residual_model=False
):
    """
    Performing the separation on audio input
    Parameters
    ----------
    audio: np.ndarray [shape=(nb_samples, nb_channels, nb_timesteps)]
        mixture audio
    model_path: str
        path to model folder, defaults to `models/`
    niter: int
         Number of EM steps for refining initial estimates in a
         post-processing stage, defaults to 1.
    softmask: boolean
        if activated, then the initial estimates for the sources will
        be obtained through a ratio mask of the mixture STFT, and not
        by using the default behavior of reconstructing waveforms
        by using the mixture phase, defaults to False
    alpha: float
        changes the exponent to use for building ratio masks, defaults to 1.0
    residual_model: boolean
        computes a residual target, for custom separation scenarios
        when not all targets are available, defaults to False
    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary of all estimates as performed by the separation model.
    """
    # convert numpy audio to NNabla Variable
    audio_nn = nn.Variable.from_numpy_array(audio.T[None, ...])
    source_names = []
    V = []

    sources = ['bass', 'drums', 'vocals', 'other']
    for j, target in enumerate(sources):
        if j == 0:
            unmix_target = model.OpenUnmix_CrossNet(max_bin=1487)
            unmix_target.is_predict = True
            nn.load_parameters(model_path)
            mix_spec, msk, _ = unmix_target(audio_nn, test=True)
        Vj = msk[Ellipsis, j*2:j*2+2, :] * mix_spec
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj.d[:, 0, ...])  # remove sample dim
        source_names += [target]
    V = np.transpose(np.array(V), (1, 3, 2, 0))

    real, imag = model.STFT(audio_nn, center=True)

    # convert to complex numpy type
    X = real.d + imag.d*1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(sources) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual'] if len(sources) > 1
                         else ['accompaniment'])

    Y = norbert.wiener(V, X.astype(np.complex128), niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(
            Y[..., j].T,
            n_fft=unmix_target.n_fft,
            n_hopsize=unmix_target.n_hop
        )
        estimates[name] = audio_hat.T

    return estimates
Example #24
def separate(input_path,
             output_path,
             model_name='umxhq',
             targets=('vocals', 'drums', 'bass', 'other'),
             samplerate=44100,
             device='cpu',
             softmask=False,
             residual_model=False,
             alpha=1.0,
             niter=1):
    """
    generate 4 subtargets
    """

    # INPUT: input path
    # OUTPUT: output path (a folder name); the subtargets are written as .wav files into this path

    # handling an input audio path
    audio, rate = sf.read(
        input_path,
        always_2d=True,
    )

    if audio.shape[1] > 2:
        warnings.warn('Channel count > 2! '
                      'Only the first two channels will be processed!')
        audio = audio[:, :2]

    if rate != samplerate:
        # resample to model samplerate if needed
        audio = resampy.resample(audio, rate, samplerate, axis=0)

    if audio.shape[1] == 1:
        # if we have mono, let's duplicate it
        # as the input of OpenUnmix is always stereo
        audio = np.repeat(audio, 2, axis=1)
    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []

    for j, target in enumerate(tqdm.tqdm(targets)):
        unmix_target = load_model(target=target,
                                  model_name=model_name,
                                  device=device)
        Vj = unmix_target(audio_torch).cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, ...])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    X = unmix_target.stft(audio_torch).detach().cpu().numpy()
    # convert to complex numpy type
    X = X[..., 0] + X[..., 1] * 1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(targets) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual']
                         if len(targets) > 1 else ['accompaniment'])

    Y = norbert.wiener(V,
                       X.astype(np.complex128),
                       niter,
                       use_softmask=softmask)

    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)
    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(Y[..., j].T,
                          n_fft=unmix_target.stft.n_fft,
                          n_hopsize=unmix_target.stft.n_hop)
        estimates[name] = audio_hat.T

        # write wav file in output_path
        subtarget_path = output_path.joinpath(name + '.wav')
        sf.write(subtarget_path, estimates[name], samplerate)
    return estimates
Example #25
def test_wiener(V, X):
    X = (X.shape[-1] * torch.ones(X.shape)).to(torch.complex128)
    Y = norbert.wiener(V, X)
    assert torch.allclose(Y.sum(-1), X)
    Y.sum().backward()