def _build_mwf_output_waveform(self):
    """Perform separation with multichannel Wiener filtering using Norbert.

    Note: multichannel Wiener filtering is not implemented in TensorFlow
    and may thus be quite slow.

    :returns: Dictionary of separated waveforms (key: instrument name,
        value: estimated waveform of the instrument).
    """
    import norbert  # pylint: disable=import-error
    output_dict = self.model_outputs
    x = self.stft_feature
    v = tf.stack(
        [
            pad_and_reshape(
                output_dict[f'{instrument}_spectrogram'],
                self._frame_length,
                self._F)[:tf.shape(x)[0], ...]
            for instrument in self._instruments
        ],
        axis=3)
    input_args = [v, x]
    # Wrap norbert.wiener in a py_function so it runs on eager numpy
    # tensors from inside the TensorFlow graph.
    stft_function = tf.py_function(
        lambda v, x: norbert.wiener(v.numpy(), x.numpy()),
        input_args,
        tf.complex64)
    return {
        instrument: self._inverse_stft(stft_function[:, :, :, k])
        for k, instrument in enumerate(self._instruments)
    }
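For reference, norbert.wiener(v, x) expects the source magnitude estimates v with shape (nb_frames, nb_bins, nb_channels, nb_sources) and the complex mixture STFT x with shape (nb_frames, nb_bins, nb_channels); the padding and stacking above only serve to reach that layout. A minimal numpy sketch of the same call outside TensorFlow (all sizes are arbitrary):

import numpy as np
import norbert

frames, bins, channels, sources = 100, 1025, 2, 4
rng = np.random.default_rng(0)

# Non-negative magnitude estimates, one slice per source.
v = rng.random((frames, bins, channels, sources))
# Complex mixture STFT.
x = (rng.standard_normal((frames, bins, channels))
     + 1j * rng.standard_normal((frames, bins, channels)))

y = norbert.wiener(v, x)  # -> (frames, bins, channels, sources), complex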
def separate(fileIn, fileOut, modelname):
    # Load audio, split it into fixed-size snippets, and compute their
    # magnitude spectrograms together with the complex STFTs.
    audio, samplerate = librosa.load(fileIn, sr=22050)
    snips = snipify(audio)
    specs, stfts = SPECify(snips)

    model = autoencoder.loadModel()
    model.load_weights(modelname)
    sourceSpecs = model.predict(specs)

    sourceaudio = np.array([])
    for i in range(sourceSpecs.shape[0]):
        sourceSpec = sourceSpecs[i].T
        stft = stfts[i]
        # Expand to the (frames, bins, channels[, sources]) layout that
        # norbert expects.
        stft = np.expand_dims(stft, axis=2)
        sourceSpec = np.expand_dims(sourceSpec, axis=2)
        sourceSpec = np.expand_dims(sourceSpec, axis=3)
        # Append a residual source, then Wiener-filter the mixture.
        resi = norbert.residual_model(sourceSpec, stft.astype(np.complex128), 1)
        sourceSpecNorbert = norbert.wiener(
            resi, stft.astype(np.complex128), 1, use_softmask=False)
        sourceSpecNorbert1 = sourceSpecNorbert[..., 0, 0]
        sourceaudio = np.append(sourceaudio, librosa.istft(sourceSpecNorbert1))

    soundfile.write(fileOut, sourceaudio, samplerate)
def test_shapes(X, V):
    Y = norbert.wiener(V, X)
    assert X.shape == Y.shape[:-1]

    Y = norbert.softmask(V, X)
    assert X.shape == Y.shape[:-1]
def test_wiener_copy(X, V):
    # Ensure norbert.wiener does not modify its inputs in place.
    X0 = np.copy(X)
    V0 = np.copy(V)
    _ = norbert.wiener(V, X)
    assert np.allclose(X0, X)
    assert np.allclose(V0, V)
def test_silent_sources(X, V):
    # All-zero source estimates must still yield correctly shaped outputs.
    V[..., :] = 0.0
    Y = norbert.softmask(V, X)
    assert X.shape == Y.shape[:-1]

    Y = norbert.wiener(V, X)
    assert X.shape == Y.shape[:-1]
def test_wiener_copy(X, V):
    # Torch variant: ensure norbert.wiener does not modify its inputs in place.
    X0 = X.clone()
    V0 = V.clone()
    _ = norbert.wiener(V, X)
    assert torch.allclose(X0, X)
    assert torch.allclose(V0, V)
def test_shapes(V, X):
    Y = norbert.residual(V, X)
    assert X.shape == Y.shape[:-1]

    Y = norbert.wiener(V, X)
    assert X.shape == Y.shape[:-1]

    Y = norbert.softmask(V, X)
    assert X.shape == Y.shape[:-1]
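The tests above receive X and V from pytest fixtures that are not shown here. A minimal sketch of plausible numpy fixtures (the names match the tests, but the sizes and random generation are assumptions; the torch variants would need tensor analogues):

import numpy as np
import pytest


@pytest.fixture
def X():
    # Random complex mixture STFT, shape (frames, bins, channels).
    rng = np.random.default_rng(0)
    return (rng.standard_normal((10, 257, 2))
            + 1j * rng.standard_normal((10, 257, 2)))


@pytest.fixture
def V(X):
    # Non-negative magnitude estimates, shape (frames, bins, channels, sources).
    rng = np.random.default_rng(1)
    return rng.random(X.shape + (4,))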
def separate_from_audio(audio, rate, mask_model, wiener_filter=True,
                        return_spectrogram=False):
    # Compute the split and full STFTs of the mixture and predict the mask.
    split_stft, full_stft = preprocess_audio_tf(
        np.expand_dims(audio, axis=0), test=True)
    mask = mask_model.predict(split_stft)
    mask_in_shape = np.concatenate(mask, axis=1)[:, :, 0]
    input_in_shape = full_stft

    # Load normalization statistics used at training time
    # (min-max scaling stored under mean/std names).
    json_path = '../norm_data_full.json'
    with open(json_path) as infile:
        norm_data = json.load(infile)
    X_mean = norm_data['X_min']
    X_std = norm_data['X_max'] - norm_data['X_min']

    test_sample = np.zeros((513, input_in_shape.shape[1]), dtype=complex)
    test_sample[0:513] = full_stft[0:513]
    mask_final = np.zeros((513, test_sample.shape[1]))
    mask_final[0:512] = np.concatenate(mask, axis=1)[:, :, 0]

    pre_result = preprocess(test_sample, X_mean, X_std)
    final_mag = denormalize(mask_final, X_mean, X_std)

    # Combine the predicted log-magnitude with the mixture phase.
    result_stft = np.multiply(np.exp(final_mag),
                              np.exp(1j * np.angle(test_sample)))
    audio_vocal_pred = tf.signal.inverse_stft(
        result_stft.T,
        frame_length=1024,
        frame_step=512,
        fft_length=1024,
        window_fn=tf.signal.inverse_stft_window_fn(512)).numpy()

    if wiener_filter:
        # Refine the estimate with a single-source Wiener filter:
        # norbert expects (frames, bins, channels) for the mixture and
        # (frames, bins, channels, sources) for the estimates.
        test_sample_T = test_sample.T[:, :, np.newaxis]
        result_stft_T = result_stft.T[:, :, np.newaxis, np.newaxis]
        v = norbert.contrib.residual_model(np.abs(result_stft_T), test_sample_T)
        result_wiener = norbert.wiener(v, test_sample_T, iterations=2)[:, :, :, 0]
        result_stft = result_wiener.T.reshape(final_mag.shape[0],
                                              final_mag.shape[1])
        audio_vocal_pred = tf.signal.inverse_stft(
            result_stft.T,
            frame_length=1024,
            frame_step=512,
            fft_length=1024,
            window_fn=tf.signal.inverse_stft_window_fn(512)).numpy()

    if return_spectrogram:
        return result_stft, mask_in_shape, preprocess_tf(
            input_in_shape, X_mean, X_std)

    return audio_vocal_pred[:len(audio)]
def PostProcess(Y, stft):
    # Bring mixture and estimate into norbert's expected layouts:
    # stft -> (frames, bins, channels), Y -> (frames, bins, channels, sources).
    stft = np.expand_dims(stft, axis=2)
    Y = np.expand_dims(Y.T, axis=3)
    # Append a residual source, then Wiener-filter the mixture.
    resi = norbert.residual_model(Y, stft.astype(np.complex128), 1)
    YNorbert = norbert.wiener(resi, stft.astype(np.complex128), 1,
                              use_softmask=False)
    YNorbert1 = YNorbert[..., 0, 0]
    Yaudio = librosa.istft(YNorbert1)
    return Yaudio
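The pattern in PostProcess (augment a single estimate with a residual source via residual_model, then Wiener-filter) can be exercised standalone. A sketch with synthetic data, assuming norbert's documented shape conventions:

import numpy as np
import norbert

frames, bins = 100, 1025
rng = np.random.default_rng(0)

# Complex mono mixture STFT, shape (frames, bins, 1).
x = (rng.standard_normal((frames, bins, 1))
     + 1j * rng.standard_normal((frames, bins, 1)))
# One estimated source magnitude, shape (frames, bins, 1, 1).
v = rng.random((frames, bins, 1, 1))

# residual_model appends a residual source so the estimates cover the mix.
v = norbert.residual_model(v, x, 1)              # now (..., 2)
y = norbert.wiener(v, x, 1, use_softmask=False)
print(y.shape)                                   # (100, 1025, 1, 2)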
def separate(audio, targets, model_name='umxhq', niter=1, softmask=False,
             alpha=1, residual_model=False, device='cpu'):
    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []
    for j, target in enumerate(tqdm.tqdm(targets)):
        unmix_target = load_model(target=target, model_name=model_name,
                                  device=device)
        Vj = unmix_target(audio_torch).cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, ...])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    X = unmix_target.stft(audio_torch).detach().cpu().numpy()
    # convert to complex numpy type
    X = X[..., 0] + X[..., 1] * 1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(targets) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual'] if len(targets) > 1
                         else ['accompaniment'])

    Y = norbert.wiener(V, X.astype(np.complex128), niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(Y[..., j].T,
                          n_fft=unmix_target.stft.n_fft,
                          n_hopsize=unmix_target.stft.n_hop)
        estimates[name] = audio_hat.T
    return estimates
def invoke_fast_norbert(filename: str):
    # Compare fast_norbert against the reference norbert implementation
    # on a saved test case.
    testcase0 = np.load(filename)
    x1, v1 = testcase0['x'], testcase0['v']
    x2, v2 = np.copy(x1), np.copy(v1)

    niter = 1
    use_softmask = False
    y1 = fast_norbert.wiener(v1, x1, niter, use_softmask=use_softmask)
    y2 = norbert.wiener(v2, x2, niter, use_softmask=use_softmask)

    assert y1.shape == y2.shape, f'{y1.shape} == {y2.shape}'
    assert np.allclose(y1, y2), f'{y1.flatten()} == {y2.flatten()}'
def _build_mwf_output_waveform(self, output_dict):
    import norbert
    x = self._features[f'{self._mix_name}_stft']
    v = tf.stack(
        [
            pad_and_reshape(
                output_dict[f'{instrument}_spectrogram'],
                self._frame_length,
                self._F)[:tf.shape(x)[0], ...]
            for instrument in self._instruments
        ],
        axis=3)
    input_args = [v, x]
    # Run norbert's Wiener filter on eager numpy tensors via py_function.
    stft_function = tf.py_function(
        lambda v, x: norbert.wiener(v.numpy(), x.numpy()),
        input_args,
        tf.complex64)
    return {
        instrument: self._inverse_stft(stft_function[:, :, :, k])
        for k, instrument in enumerate(self._instruments)
    }
def preprocess_with_norbert(complex_stft_mix, predicted_magnitudes):
    # norbert expects:
    # v: np.ndarray [shape=(nb_frames, nb_bins, {1, nb_channels}, nb_sources)]
    # x: np.ndarray [complex, shape=(nb_frames, nb_bins, nb_channels)]
    complex_stft_mix = complex_stft_mix.detach().data.cpu()
    complex_stft_mix_numpy = (
        np.array(complex_stft_mix[:, :, :, :, 0])
        + 1j * np.array(complex_stft_mix[:, :, :, :, 1]))
    complex_stft_mix_numpy = complex_stft_mix_numpy.transpose([0, 3, 2, 1])

    predicted_magnitudes = predicted_magnitudes.detach().data.cpu()
    predicted_magnitudes = np.array(predicted_magnitudes).transpose(3, 2, 0, 1)

    predicted_complex_stft = norbert.wiener(predicted_magnitudes,
                                            complex_stft_mix_numpy[0])

    # Convert back to a (real, imag) torch tensor.
    real = np.real(predicted_complex_stft).transpose(2, 3, 1, 0)
    imag = np.imag(predicted_complex_stft).transpose(2, 3, 1, 0)
    torch_predicted_stft = torch.stack(
        (torch.tensor(real), torch.tensor(imag)), dim=4).float()
    return torch_predicted_stft
def run(self):
    source_magnitudes = np.stack(
        [np.abs(e.stft()) for e in self.estimates], axis=-1)
    source_magnitudes = np.transpose(source_magnitudes, (1, 0, 2, 3))
    mix_stft = np.transpose(self.audio_signal.stft(), (1, 0, 2))

    enhanced = norbert.wiener(source_magnitudes, mix_stft,
                              iterations=self.iterations, **self.kwargs)
    # Derive soft masks from the enhanced source estimates.
    _masks = np.abs(enhanced) / np.maximum(1e-7, np.abs(mix_stft[..., None]))
    _masks = np.transpose(_masks, (1, 0, 2, 3))

    self.result_masks = []
    for i in range(_masks.shape[-1]):
        mask_data = _masks[..., i]
        if self.mask_type == self.MASKS['binary']:
            mask_data = _masks[..., i] == np.max(_masks, axis=-1)
        mask = self.mask_type(mask_data)
        self.result_masks.append(mask)

    return self.result_masks
def separate(
    audio,
    targets,
    model_name='umxhq',
    niter=1,
    softmask=False,
    alpha=1.0,
    residual_model=False,
    device='cpu'
):
    """
    Performing the separation on audio input

    Parameters
    ----------
    audio: np.ndarray [shape=(nb_timesteps, nb_channels)]
        mixture audio

    targets: list of str
        a list of the separation targets.
        Note that for each target a separate model is expected
        to be loaded.

    model_name: str
        name of torchhub model or path to model folder, defaults to `umxhq`

    niter: int
        Number of EM steps for refining initial estimates
        in a post-processing stage, defaults to 1.

    softmask: boolean
        if activated, then the initial estimates for the sources will
        be obtained through a ratio mask of the mixture STFT, and not
        by using the default behavior of reconstructing waveforms
        by using the mixture phase, defaults to False

    alpha: float
        changes the exponent to use for building ratio masks, defaults to 1.0

    residual_model: boolean
        computes a residual target, for custom separation scenarios
        when not all targets are available, defaults to False

    device: str
        set torch device. Defaults to `cpu`.

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary of all estimates as performed by the separation model.
    """
    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []
    for j, target in enumerate(tqdm.tqdm(targets)):
        unmix_target = load_model(
            target=target,
            model_name=model_name,
            device=device
        )
        Vj = unmix_target(audio_torch).cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, ...])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    X = unmix_target.stft(audio_torch).detach().cpu().numpy()
    # convert to complex numpy type
    X = X[..., 0] + X[..., 1]*1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(targets) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual'] if len(targets) > 1
                         else ['accompaniment'])

    Y = norbert.wiener(V, X.astype(np.complex128), niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(
            Y[..., j].T,
            n_fft=unmix_target.stft.n_fft,
            n_hopsize=unmix_target.stft.n_hop
        )
        estimates[name] = audio_hat.T

    return estimates
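A usage sketch for this function, assuming a stereo mixture file (the file name is a placeholder and the models are fetched by load_model as above):

import soundfile as sf

audio, rate = sf.read('mix.wav', always_2d=True)  # hypothetical input file
estimates = separate(audio, targets=['vocals', 'drums', 'bass', 'other'])
vocals = estimates['vocals']  # np.ndarray, shape (nb_timesteps, nb_channels)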
def test_wiener(V, X):
    # With a constant mixture, the Wiener estimates must sum back to it.
    X = (X.shape[-1] * np.ones(X.shape)).astype(np.complex128)
    Y = norbert.wiener(V, X)
    assert np.allclose(Y.sum(-1), X)
def predict(dnn_model, device, data, sr, trained_on="vocals"):
    """
    Predicts the estimates of vocals and accompaniment using the model
    provided.

    Parameters
    ----------
    dnn_model : Generalised_Recurrent_Model
        model to use for prediction
    device : torch.device
        device to use
    data : ndarray, shape(nb_samples, nb_channels)
        data of mixture track in time series
    sr : int
        sampling rate of the mixture track
    trained_on : str
        labels of the trained model, "vocals" or "accompaniment"

    Returns
    -------
    acc_estimate: ndarray, shape(nb_samples, nb_channels)
        accompaniment estimates in time series
    vocals_estimate: ndarray, shape(nb_samples, nb_channels)
        vocals estimates in time series
    """
    # transformation object
    transform = STFT(sr=DATASET_CONFIG.SR,
                     n_per_seg=DATASET_CONFIG.N_PER_SEG,
                     n_overlap=DATASET_CONFIG.N_OVERLAP)

    # scaler object
    scaler = Scaler()

    # convert track to mono
    if data.shape[1] != 1:
        data = sp.to_mono(data)

    nb_samples, nb_channels = data.shape

    # generate STFT of time series data, shape (nb_frames, nb_bins, nb_channels)
    mixture_tf = transform.stft(data.T)

    # magnitude spectrogram of the STFT, i.e. |X_i|, same shape
    mixture_stft = np.abs(mixture_tf)

    # scale the values to [0, 1], shape (nb_frames, nb_bins, nb_channels)
    X_scaled = scaler.scale(mixture_stft)

    # transpose to shape (nb_batch, nb_frames, nb_bins)
    X_scaled = np.transpose(X_scaled, (2, 0, 1))

    mixture_tensor = torch.tensor(X_scaled, dtype=torch.float32,
                                  device=device).to(device)

    estimate = dnn_model(mixture_tensor)

    # output tensor shape (nb_batch, nb_frames, nb_bins)
    estimate_np = estimate[0].cpu().detach().numpy()

    # stack the output to stereo and transpose back to
    # shape (nb_frames, nb_bins, nb_channels)
    estimate_stereo = np.stack([estimate_np, estimate_np]).transpose(1, 2, 0)

    # square the magnitudes (power spectrogram) and add a trailing sources axis
    estimate_stereo = estimate_stereo[..., None]**2

    # stack the mixture STFT to stereo and transpose back to
    # shape (nb_frames, nb_bins, nb_channels)
    mixture_tf_squeeze = np.squeeze(mixture_tf)
    mixture_tf_stereo = np.stack([mixture_tf_squeeze,
                                  mixture_tf_squeeze]).transpose(1, 2, 0)

    # append a residual source so the estimates account for the full mixture
    estimate_residual = norbert.residual(estimate_stereo, mixture_tf_stereo)

    # apply Wiener filtering to obtain the sources
    estimate_filter_results = norbert.wiener(np.copy(estimate_residual),
                                             np.copy(mixture_tf_stereo))

    # return the estimates based on the source type the model was trained on
    if trained_on == "vocals":
        vocals_estimate = transform.istft(estimate_filter_results[..., 0]).T
        acc_estimate = transform.istft(estimate_filter_results[..., 1]).T
    else:
        acc_estimate = transform.istft(estimate_filter_results[..., 0]).T
        vocals_estimate = transform.istft(estimate_filter_results[..., 1]).T
    return acc_estimate, vocals_estimate
def separate(
    audio,
    x_umx_target,
    instruments,
    niter=1,
    softmask=False,
    alpha=1.0,
    residual_model=False,
    device="cpu",
):
    """
    Performing the separation on audio input

    Parameters
    ----------
    audio: np.ndarray [shape=(nb_timesteps, nb_channels)]
        mixture audio

    x_umx_target: asteroid.models
        X-UMX model used for separating

    instruments: list
        the list of instruments, e.g., ["bass", "drums", "vocals"]

    niter: int
        Number of EM steps for refining initial estimates in a
        post-processing stage, defaults to 1.

    softmask: boolean
        if activated, then the initial estimates for the sources will
        be obtained through a ratio mask of the mixture STFT, and not
        by using the default behavior of reconstructing waveforms
        by using the mixture phase, defaults to False

    alpha: float
        changes the exponent to use for building ratio masks, defaults to 1.0

    residual_model: boolean
        computes a residual target, for custom separation scenarios
        when not all targets are available, defaults to False

    device: str
        set torch device. Defaults to `cpu`.

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary with all estimates obtained by the separation model.
    """
    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []

    masked_tf_rep, _ = x_umx_target(audio_torch)
    # shape: (sources, frames, batch, channels, bins)

    for j, target in enumerate(instruments):
        Vj = masked_tf_rep[j, Ellipsis].cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, Ellipsis])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    # convert to complex numpy type
    tmp = x_umx_target.encoder(audio_torch)
    X = torch_complex_from_magphase(tmp[0].permute(1, 2, 3, 0), tmp[1])
    X = X.detach().cpu().numpy()
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(instruments) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (["residual"] if len(instruments) > 1
                         else ["accompaniment"])

    Y = norbert.wiener(V, X.astype(np.complex128), niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(
            Y[..., j].T,
            rate=x_umx_target.sample_rate,
            n_fft=x_umx_target.in_chan,
            n_hopsize=x_umx_target.n_hop,
        )
        estimates[name] = audio_hat.T
    return estimates
def test_wiener(V, X, nb_iterations):
    X = X.shape[-1] * np.ones(X.shape)
    Y = norbert.wiener(V, X, iterations=nb_iterations)
    assert np.allclose(Y.sum(-1), X)
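Both test_wiener variants check the same conservativity property: because the multichannel Wiener gains are built from source covariances that sum to the mixture covariance, the filtered sources always sum back to the mixture. A standalone check, mirroring the constant mixture used in the tests:

import numpy as np
import norbert

rng = np.random.default_rng(0)
x = 2 * np.ones((50, 257, 2), dtype=np.complex128)  # constant mixture
v = rng.random((50, 257, 2, 4))                     # arbitrary estimates

y = norbert.wiener(v, x)
assert np.allclose(y.sum(axis=-1), x)  # estimates partition the mixture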
Vj = []  # holds vocals' spectrograms.
for i in tqdm.tqdm(range(len(X)), desc='Estimating vocals..'):
    Vj.append(model(X[i]))
Vj = torch.cat(Vj, dim=3).cpu().detach().numpy()

# Prepare input for MWF.
print('Calculating MWF..')
V_vox = np.transpose(Vj, [3, 0, 1, 2])
V.append(V_vox[:, 0, ...])  # remove sample dim
V = np.transpose(np.array(V), (1, 3, 2, 0))

X = model.mdensenet.stft(audio).detach().cpu().numpy()
X = X[..., 0] + X[..., 1] * 1j
X = X[0].transpose(2, 1, 0)

V = norbert.residual_model(V, X, 1)
Y = norbert.wiener(V, X.astype(np.complex128), 1, use_softmask=False)

# Extract source estimates in time domain.
s = []
estimates = {}
for j in range(Y.shape[-1]):
    audio_hat = istft(Y[..., j].T, n_fft=n_fft, n_hop=n_hop, sr=sr)
    s.append(audio_hat.T)

end_time = time.time()
print(f'Separation duration: {end_time - start_time:.2f} sec.')

print('Saving track..')
out_name = Path(args.out_name).expanduser()
out_name.parent.mkdir(parents=True, exist_ok=True)
def separate(audio, args):
    """
    Performing the separation on audio input

    Parameters
    ----------
    audio: np.ndarray [shape=(nb_timesteps, nb_channels)]
        mixture audio

    args : ArgumentParser
        ArgumentParser for OpenUnmix_CrossNet (X-UMX) / OpenUnmix (UMX)
        inference

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary of all estimates as performed by the separation model.
    """
    # convert numpy audio to NNabla Variable
    audio_nn = nn.Variable.from_numpy_array(audio.T[None, ...])
    source_names = []
    V = []
    max_bin = bandwidth_to_max_bin(sample_rate=44100, n_fft=4096,
                                   bandwidth=16000)

    if not args.umx_infer:
        # Run X-UMX inference
        nn.load_parameters(args.model)
        for j, target in enumerate(args.targets):
            if j == 0:
                unmix_target = model.OpenUnmix_CrossNet(max_bin=max_bin,
                                                        is_predict=True)
                mix_spec, msk, _ = unmix_target(audio_nn, test=True)
            # Network output is (nb_frames, nb_samples, nb_channels, nb_bins)
            V.append((msk[Ellipsis, j * 2:j * 2 + 2, :] * mix_spec).d[:, 0, ...])
            source_names += [target]
    else:
        # Run UMX inference
        for j, target in enumerate(args.targets):
            with nn.parameter_scope(target):
                unmix_target = model.OpenUnmix(max_bin=max_bin)
                nn.load_parameters(f"{os.path.join(args.model, target)}.h5")
                # Network output is (nb_frames, nb_samples, nb_channels, nb_bins)
                V.append(unmix_target(audio_nn, test=True).d[:, 0, ...])
            source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))
    if args.softmask:
        # only exponentiate the model if we use softmask
        V = V**args.alpha

    real, imag = model.get_stft(audio_nn, center=True)
    # convert to complex numpy type
    X = real.d + imag.d * 1j
    X = X[0].transpose(2, 1, 0)

    if args.residual_model or len(args.targets) == 1:
        V = norbert.residual_model(V, X, args.alpha if args.softmask else 1)
        source_names += (['residual'] if len(args.targets) > 1
                         else ['accompaniment'])

    Y = norbert.wiener(V, X.astype(np.complex128), args.niter,
                       use_softmask=args.softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(Y[..., j].T,
                          n_fft=unmix_target.n_fft,
                          n_hopsize=unmix_target.n_hop)
        estimates[name] = audio_hat.T
    return estimates
def separate(audio,
             targets,
             model_name='umxhq',
             niter=1,
             softmask=False,
             alpha=1.0,
             residual_model=False,
             device='cpu'):
    """
    Performing the separation on audio input

    Parameters
    ----------
    audio: np.ndarray [shape=(nb_timesteps, nb_channels)]
        mixture audio

    targets: list of str
        a list of the separation targets.
        Note that for each target a separate model is expected
        to be loaded.

    model_name: str
        name of torchhub model or path to model folder, defaults to `umxhq`

    niter: int
        Number of EM steps for refining initial estimates in a
        post-processing stage, defaults to 1.

    softmask: boolean
        if activated, then the initial estimates for the sources will
        be obtained through a ratio mask of the mixture STFT, and not
        by using the default behavior of reconstructing waveforms
        by using the mixture phase, defaults to False

    alpha: float
        changes the exponent to use for building ratio masks, defaults to 1.0

    residual_model: boolean
        computes a residual target, for custom separation scenarios
        when not all targets are available, defaults to False

    device: str
        set torch device. Defaults to `cpu`.

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary of all estimates as performed by the separation model.
    """
    # convert numpy audio to torch
    print('loading audio')
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)
    print('audio loaded')

    source_names = targets
    unmix = load_model(targets=targets, model_name=model_name, device=device)
    print('model loaded')

    # obtain the masks from the model
    V = unmix(audio_torch)
    print('separation obtained')
    X = unmix.stft(audio_torch).permute(3, 0, 1, 2, 4)

    # apply the masks to the mixture magnitude
    mag = torchaudio.functional.complex_norm(X)
    V = [Y_hat * mag for Y_hat in V]

    # from torch to complex numpy, for norbert's EM algorithm
    V = np.array([m.cpu().detach().numpy() for m in V])[:, :, 0, :, :]
    V = V.transpose(1, 3, 2, 0)
    X = X.detach().cpu().numpy()[:, 0, :, :]
    X = X[..., 0] + X[..., 1] * 1j
    X = X.transpose(0, 2, 1)
    print('pre-norbert OK')

    # apply norbert's Wiener filter
    Y_EM = norbert.wiener(V, X.astype(np.complex128), niter,
                          use_softmask=softmask)
    print('norbert OK')

    # back to torch complex for torchaudio ISTFT
    Y_hats = torch.stack(
        [torch.from_numpy(np.real(Y_EM)),
         torch.from_numpy(np.imag(Y_EM))]).permute(1, 4, 3, 2, 0)
    Y_hats = Y_hats.float().unsqueeze(2).unbind(1)
    y_hats = [unmix.istft(spec, audio_torch.shape[-1]) for spec in Y_hats]

    # back to numpy for BSSeval
    y_hats = [y_hat.cpu().detach().numpy() for y_hat in y_hats]
    print('numpy OK')

    estimates = {}
    for j, name in enumerate(source_names):
        # final estimate should have shape (length, 2) and dtype float64
        estimates[name] = y_hats[j][0].T
    return estimates
def separate(
    audio,
    model_path='models/x-umx.h5',
    niter=1,
    softmask=False,
    alpha=1.0,
    residual_model=False
):
    """
    Performing the separation on audio input

    Parameters
    ----------
    audio: np.ndarray [shape=(nb_timesteps, nb_channels)]
        mixture audio

    model_path: str
        path to model folder, defaults to `models/`

    niter: int
        Number of EM steps for refining initial estimates
        in a post-processing stage, defaults to 1.

    softmask: boolean
        if activated, then the initial estimates for the sources will
        be obtained through a ratio mask of the mixture STFT, and not
        by using the default behavior of reconstructing waveforms
        by using the mixture phase, defaults to False

    alpha: float
        changes the exponent to use for building ratio masks, defaults to 1.0

    residual_model: boolean
        computes a residual target, for custom separation scenarios
        when not all targets are available, defaults to False

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary of all estimates as performed by the separation model.
    """
    # convert numpy audio to NNabla Variable
    audio_nn = nn.Variable.from_numpy_array(audio.T[None, ...])
    source_names = []
    V = []
    sources = ['bass', 'drums', 'vocals', 'other']

    for j, target in enumerate(sources):
        if j == 0:
            unmix_target = model.OpenUnmix_CrossNet(max_bin=1487)
            unmix_target.is_predict = True
            nn.load_parameters(model_path)
            mix_spec, msk, _ = unmix_target(audio_nn, test=True)
        Vj = msk[Ellipsis, j*2:j*2+2, :] * mix_spec
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj.d[:, 0, ...])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    real, imag = model.STFT(audio_nn, center=True)
    # convert to complex numpy type
    X = real.d + imag.d*1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(sources) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual'] if len(sources) > 1
                         else ['accompaniment'])

    Y = norbert.wiener(V, X.astype(np.complex128), niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(
            Y[..., j].T,
            n_fft=unmix_target.n_fft,
            n_hopsize=unmix_target.n_hop
        )
        estimates[name] = audio_hat.T
    return estimates
def separate(input_path,
             output_path,
             model_name='umxhq',
             targets=('vocals', 'drums', 'bass', 'other'),
             samplerate=44100,
             device='cpu',
             softmask=False,
             residual_model=False,
             alpha=1.0,
             niter=1):
    """
    Generate 4 subtargets.

    Input: path to the mixture audio file.
    Output: `output_path` is a folder name; the subtargets are written
    into it as .wav files.
    """
    # handling an input audio path
    audio, rate = sf.read(
        input_path,
        always_2d=True,
    )

    if audio.shape[1] > 2:
        warnings.warn('Channel count > 2! '
                      'Only the first two channels will be processed!')
        audio = audio[:, :2]

    if rate != samplerate:
        # resample to model samplerate if needed
        audio = resampy.resample(audio, rate, samplerate, axis=0)

    if audio.shape[1] == 1:
        # if we have mono, let's duplicate it
        # as the input of OpenUnmix is always stereo
        audio = np.repeat(audio, 2, axis=1)

    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []
    for j, target in enumerate(tqdm.tqdm(targets)):
        unmix_target = load_model(target=target, model_name=model_name,
                                  device=device)
        Vj = unmix_target(audio_torch).cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, ...])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    X = unmix_target.stft(audio_torch).detach().cpu().numpy()
    # convert to complex numpy type
    X = X[..., 0] + X[..., 1] * 1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(targets) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual'] if len(targets) > 1
                         else ['accompaniment'])

    Y = norbert.wiener(V, X.astype(np.complex128), niter,
                       use_softmask=softmask)

    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(Y[..., j].T,
                          n_fft=unmix_target.stft.n_fft,
                          n_hopsize=unmix_target.stft.n_hop)
        estimates[name] = audio_hat.T
        # write wav file in output_path
        subtarget_path = output_path.joinpath(name + '.wav')
        sf.write(subtarget_path, estimates[name], samplerate)
    return estimates
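A usage sketch, assuming a mixture at 'mix.wav' (a hypothetical path) and the default umxhq models:

estimates = separate('mix.wav', 'separated')
# writes separated/vocals.wav, separated/drums.wav, separated/bass.wav,
# separated/other.wav and returns the same estimates as numpy arrays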
def test_wiener(V, X):
    # Torch variant: filtered sources must sum back to the mixture, and the
    # whole pipeline must be differentiable.
    X = (X.shape[-1] * torch.ones(X.shape)).to(torch.complex128)
    Y = norbert.wiener(V, X)
    assert torch.allclose(Y.sum(-1), X)
    Y.sum().backward()