def separate(fileIn, fileOut, modelname):
    audio, samplerate = librosa.load(fileIn, sr=22050)
    snips = snipify(audio)
    specs, stfts = SPECify(snips)
    model = autoencoder.loadModel()
    model.load_weights(modelname)
    sourceSpecs = model.predict(specs)
    sourceaudio = np.array([])
    for i in range(0, sourceSpecs.shape[0]):
        sourceSpec = sourceSpecs[i].T
        stft = stfts[i]
        # norbert expects the mixture x as (nb_frames, nb_bins, nb_channels)
        # and the source estimates v as (nb_frames, nb_bins, nb_channels, nb_sources)
        stft = np.expand_dims(stft, axis=2)
        sourceSpec = np.expand_dims(sourceSpec, axis=2)
        sourceSpec = np.expand_dims(sourceSpec, axis=3)
        print(sourceSpec.shape, stft.shape)
        resi = norbert.residual_model(sourceSpec, stft.astype(np.complex128), 1)
        sourceSpecNorbert = norbert.wiener(resi, stft.astype(np.complex128), 1,
                                           use_softmask=False)
        # keep the first channel of the first source
        sourceSpecNorbert1 = sourceSpecNorbert[:, ..., 0, 0]
        sourceaudio = np.append(sourceaudio, librosa.istft(sourceSpecNorbert1))
    soundfile.write(fileOut, sourceaudio, samplerate)
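# A minimal sketch of invoking the pipeline above; the file names and the
# weights file are hypothetical placeholders, and snipify/SPECify/autoencoder
# are helpers defined in the surrounding project.
separate('mixture.wav', 'estimate.wav', 'autoencoder_weights.h5')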
def test_residual_copy(X, V):
    # torch variant: residual_model must not modify its inputs in place
    X0 = X.clone()
    V0 = V.clone()

    _ = norbert.residual_model(V, X)

    assert torch.allclose(X0, X)
    assert torch.allclose(V0, V)
def test_residual_copy(X, V):
    # NumPy variant of the same check
    X0 = np.copy(X)
    V0 = np.copy(V)

    _ = norbert.residual_model(V, X)

    assert np.allclose(X0, X)
    assert np.allclose(V0, V)
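# A minimal sketch of NumPy fixtures these tests could run against (the torch
# variant above would need torch tensors instead). Shapes follow norbert's
# convention: X is a complex mixture STFT of shape
# (nb_frames, nb_bins, nb_channels), and V holds non-negative source
# spectrogram estimates of shape (nb_frames, nb_bins, nb_channels, nb_sources).
import numpy as np
import pytest


@pytest.fixture
def X():
    rng = np.random.default_rng(42)
    return (rng.standard_normal((10, 257, 2))
            + 1j * rng.standard_normal((10, 257, 2)))


@pytest.fixture
def V(X):
    rng = np.random.default_rng(0)
    return rng.random(X.shape + (3,))  # three hypothetical sources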
def test_shapes(V, X):
    # each filtering function returns the mixture shape
    # plus a trailing source dimension
    Y = norbert.residual_model(V, X)
    assert X.shape == Y.shape[:-1]

    Y = norbert.wiener(V, X)
    assert X.shape == Y.shape[:-1]

    Y = norbert.softmask(V, X)
    assert X.shape == Y.shape[:-1]
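# A quick standalone check of the same shape contract, assuming only numpy and
# norbert are installed; the shapes and source count are arbitrary.
import numpy as np
import norbert

x = np.random.randn(100, 1025, 2) + 1j * np.random.randn(100, 1025, 2)
v = np.abs(np.random.randn(100, 1025, 2, 4))  # 4 source magnitude estimates

y = norbert.wiener(v, x)
assert y.shape == x.shape + (4,)  # mixture shape plus a source axis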
def PostProcess(Y, stft):
    # add a channel axis to the mixture STFT and a source axis to the model
    # output so both match norbert's expected layouts
    stft = np.expand_dims(stft, axis=2)
    Y = np.expand_dims(Y.T, axis=3)
    resi = norbert.residual_model(Y, stft.astype(np.complex128), 1)
    YNorbert = norbert.wiener(resi, stft.astype(np.complex128), 1,
                              use_softmask=False)
    # keep the first channel of the first source
    YNorbert1 = YNorbert[:, ..., 0, 0]
    Yaudio = librosa.istft(YNorbert1)
    return Yaudio
def separate(audio, targets, model_name='umxhq', niter=1, softmask=False,
             alpha=1, residual_model=False, device='cpu'):
    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []
    for j, target in enumerate(tqdm.tqdm(targets)):
        unmix_target = load_model(target=target,
                                  model_name=model_name,
                                  device=device)
        Vj = unmix_target(audio_torch).cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, ...])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    X = unmix_target.stft(audio_torch).detach().cpu().numpy()
    # convert to complex numpy type
    X = X[..., 0] + X[..., 1] * 1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(targets) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual'] if len(targets) > 1
                         else ['accompaniment'])

    Y = norbert.wiener(V, X.astype(np.complex128), niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(Y[..., j].T,
                          n_fft=unmix_target.stft.n_fft,
                          n_hopsize=unmix_target.stft.n_hop)
        estimates[name] = audio_hat.T

    return estimates
def separate(
    audio,
    targets,
    model_name='umxhq',
    niter=1,
    softmask=False,
    alpha=1.0,
    residual_model=False,
    device='cpu'
):
    """
    Performing the separation on audio input

    Parameters
    ----------
    audio: np.ndarray [shape=(nb_timesteps, nb_channels)]
        mixture audio

    targets: list of str
        a list of the separation targets. Note that for each target
        a separate model is expected to be loaded.

    model_name: str
        name of torchhub model or path to model folder, defaults to `umxhq`

    niter: int
        Number of EM steps for refining initial estimates
        in a post-processing stage, defaults to 1.

    softmask: boolean
        if activated, then the initial estimates for the sources will
        be obtained through a ratio mask of the mixture STFT, and not
        by using the default behavior of reconstructing waveforms
        by using the mixture phase, defaults to False

    alpha: float
        changes the exponent to use for building ratio masks, defaults to 1.0

    residual_model: boolean
        computes a residual target, for custom separation scenarios
        when not all targets are available, defaults to False

    device: str
        set torch device. Defaults to `cpu`.

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary of all estimates as performed by the separation model.
    """
    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []
    for j, target in enumerate(tqdm.tqdm(targets)):
        unmix_target = load_model(
            target=target,
            model_name=model_name,
            device=device
        )
        Vj = unmix_target(audio_torch).cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, ...])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    X = unmix_target.stft(audio_torch).detach().cpu().numpy()
    # convert to complex numpy type
    X = X[..., 0] + X[..., 1]*1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(targets) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual'] if len(targets) > 1
                         else ['accompaniment'])

    Y = norbert.wiener(V, X.astype(np.complex128), niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(
            Y[..., j].T,
            n_fft=unmix_target.stft.n_fft,
            n_hopsize=unmix_target.stft.n_hop
        )
        estimates[name] = audio_hat.T

    return estimates
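# Hypothetical driver code for the documented separate() above; it assumes a
# stereo mixture file at the model's sample rate and uses soundfile for I/O.
import soundfile as sf

audio, rate = sf.read('mixture.wav', always_2d=True)  # (nb_timesteps, nb_channels)
estimates = separate(audio, targets=['vocals', 'drums', 'bass', 'other'])
for name, waveform in estimates.items():
    sf.write(f'{name}.wav', waveform, rate)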
def separate(
    audio,
    model_path='models/x-umx.h5',
    niter=1,
    softmask=False,
    alpha=1.0,
    residual_model=False
):
    """
    Performing the separation on audio input

    Parameters
    ----------
    audio: np.ndarray [shape=(nb_timesteps, nb_channels)]
        mixture audio

    model_path: str
        path to the model file, defaults to `models/x-umx.h5`

    niter: int
        Number of EM steps for refining initial estimates
        in a post-processing stage, defaults to 1.

    softmask: boolean
        if activated, then the initial estimates for the sources will
        be obtained through a ratio mask of the mixture STFT, and not
        by using the default behavior of reconstructing waveforms
        by using the mixture phase, defaults to False

    alpha: float
        changes the exponent to use for building ratio masks, defaults to 1.0

    residual_model: boolean
        computes a residual target, for custom separation scenarios
        when not all targets are available, defaults to False

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary of all estimates as performed by the separation model.
    """
    # convert numpy audio to NNabla Variable
    audio_nn = nn.Variable.from_numpy_array(audio.T[None, ...])

    source_names = []
    V = []
    sources = ['bass', 'drums', 'vocals', 'other']
    for j, target in enumerate(sources):
        if j == 0:
            # build the network once and reuse its outputs for all targets
            unmix_target = model.OpenUnmix_CrossNet(max_bin=1487)
            unmix_target.is_predict = True
            nn.load_parameters(model_path)
            mix_spec, msk, _ = unmix_target(audio_nn, test=True)
        Vj = msk[Ellipsis, j*2:j*2+2, :] * mix_spec
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj.d[:, 0, ...])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    real, imag = model.STFT(audio_nn, center=True)
    # convert to complex numpy type
    X = real.d + imag.d*1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(sources) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual'] if len(sources) > 1
                         else ['accompaniment'])

    Y = norbert.wiener(V, X.astype(np.complex128), niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(
            Y[..., j].T,
            n_fft=unmix_target.n_fft,
            n_hopsize=unmix_target.n_hop
        )
        estimates[name] = audio_hat.T

    return estimates
def separate(audio, args):
    """
    Performing the separation on audio input

    Parameters
    ----------
    audio: np.ndarray [shape=(nb_timesteps, nb_channels)]
        mixture audio

    args : ArgumentParser
        ArgumentParser for OpenUnmix_CrossNet(X-UMX)/OpenUnmix(UMX) Inference

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary of all estimates as performed by the separation model.
    """
    # convert numpy audio to NNabla Variable
    audio_nn = nn.Variable.from_numpy_array(audio.T[None, ...])

    source_names = []
    V = []
    max_bin = bandwidth_to_max_bin(sample_rate=44100, n_fft=4096,
                                   bandwidth=16000)

    if not args.umx_infer:
        # Run X-UMX Inference
        nn.load_parameters(args.model)
        for j, target in enumerate(args.targets):
            if j == 0:
                unmix_target = model.OpenUnmix_CrossNet(max_bin=max_bin,
                                                        is_predict=True)
                mix_spec, msk, _ = unmix_target(audio_nn, test=True)
            # Network output is (nb_frames, nb_samples, nb_channels, nb_bins)
            V.append((msk[Ellipsis, j * 2:j * 2 + 2, :] * mix_spec).d[:, 0, ...])
            source_names += [target]
    else:
        # Run UMX Inference
        for j, target in enumerate(args.targets):
            with nn.parameter_scope(target):
                unmix_target = model.OpenUnmix(max_bin=max_bin)
                nn.load_parameters(f"{os.path.join(args.model, target)}.h5")
                # Network output is (nb_frames, nb_samples, nb_channels, nb_bins)
                V.append(unmix_target(audio_nn, test=True).d[:, 0, ...])
                source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))
    if args.softmask:
        # only exponentiate the model if we use softmask
        V = V**args.alpha

    real, imag = model.get_stft(audio_nn, center=True)
    # convert to complex numpy type
    X = real.d + imag.d * 1j
    X = X[0].transpose(2, 1, 0)

    if args.residual_model or len(args.targets) == 1:
        V = norbert.residual_model(V, X, args.alpha if args.softmask else 1)
        source_names += (['residual'] if len(args.targets) > 1
                         else ['accompaniment'])

    Y = norbert.wiener(V, X.astype(np.complex128), args.niter,
                       use_softmask=args.softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(Y[..., j].T,
                          n_fft=unmix_target.n_fft,
                          n_hopsize=unmix_target.n_hop)
        estimates[name] = audio_hat.T

    return estimates
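# A hypothetical argument set mirroring the flags this function reads;
# argparse.Namespace stands in for the real parser output, the model path is
# a placeholder, and `audio` is a mixture array as in the docstring.
from argparse import Namespace

args = Namespace(
    umx_infer=False,   # False -> X-UMX inference, True -> per-target UMX
    model='x-umx.h5',  # parameter file (X-UMX) or parameter folder (UMX)
    targets=['bass', 'drums', 'vocals', 'other'],
    softmask=False,
    alpha=1.0,
    residual_model=False,
    niter=1,
)
estimates = separate(audio, args)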
X = torch.split(x, num_frames, dim=3)
Vj = []  # holds vocals' spectrograms.
for i in tqdm.tqdm(range(len(X)), desc='Estimating vocals..'):
    Vj.append(model(X[i]))
Vj = torch.cat(Vj, dim=3).cpu().detach().numpy()

# Prepare input for MWF.
print('Calculating MWF..')
V_vox = np.transpose(Vj, [3, 0, 1, 2])
V.append(V_vox[:, 0, ...])  # remove sample dim
V = np.transpose(np.array(V), (1, 3, 2, 0))

X = model.mdensenet.stft(audio).detach().cpu().numpy()
X = X[..., 0] + X[..., 1] * 1j
X = X[0].transpose(2, 1, 0)

V = norbert.residual_model(V, X, 1)
Y = norbert.wiener(V, X.astype(np.complex128), 1, use_softmask=False)

# Extract source estimates in time domain.
s = []
estimates = {}
for j in range(Y.shape[-1]):
    audio_hat = istft(Y[..., j].T, n_fft=n_fft, n_hop=n_hop, sr=sr)
    s.append(audio_hat.T)

end_time = time.time()
print(f'Separation duration: {end_time - start_time:.2f} sec.')

print('Saving track..')
out_name = Path(args.out_name).expanduser()
out_name.parent.mkdir(parents=True, exist_ok=True)
def separate(
    audio,
    x_umx_target,
    instruments,
    niter=1,
    softmask=False,
    alpha=1.0,
    residual_model=False,
    device="cpu",
):
    """
    Performing the separation on audio input

    Parameters
    ----------
    audio: np.ndarray [shape=(nb_timesteps, nb_channels)]
        mixture audio

    x_umx_target: asteroid.models
        X-UMX model used for separating

    instruments: list
        The list of instruments, e.g., ["bass", "drums", "vocals"]

    niter: int
        Number of EM steps for refining initial estimates
        in a post-processing stage, defaults to 1.

    softmask: boolean
        if activated, then the initial estimates for the sources will
        be obtained through a ratio mask of the mixture STFT, and not
        by using the default behavior of reconstructing waveforms
        by using the mixture phase, defaults to False

    alpha: float
        changes the exponent to use for building ratio masks, defaults to 1.0

    residual_model: boolean
        computes a residual target, for custom separation scenarios
        when not all targets are available, defaults to False

    device: str
        set torch device. Defaults to `cpu`.

    Returns
    -------
    estimates: `dict` [`str`, `np.ndarray`]
        dictionary with all estimates obtained by the separation model.
    """
    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []

    # shape: (Sources, frames, batch, channels, fbin)
    masked_tf_rep, _ = x_umx_target(audio_torch)

    for j, target in enumerate(instruments):
        Vj = masked_tf_rep[j, Ellipsis].cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, Ellipsis])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    # convert to complex numpy type
    tmp = x_umx_target.encoder(audio_torch)
    X = torch_complex_from_magphase(tmp[0].permute(1, 2, 3, 0), tmp[1])
    X = X.detach().cpu().numpy()
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(instruments) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (["residual"] if len(instruments) > 1
                         else ["accompaniment"])

    Y = norbert.wiener(V, X.astype(np.complex128), niter,
                       use_softmask=softmask)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(
            Y[..., j].T,
            rate=x_umx_target.sample_rate,
            n_fft=x_umx_target.in_chan,
            n_hopsize=x_umx_target.n_hop,
        )
        estimates[name] = audio_hat.T

    return estimates
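# Hypothetical usage, assuming a trained asteroid X-UMX checkpoint is
# available; the checkpoint path is a placeholder and `audio` is a mixture
# array as in the docstring.
from asteroid.models import XUMX

x_umx = XUMX.from_pretrained('path/to/xumx_checkpoint')
estimates = separate(audio, x_umx, instruments=['bass', 'drums', 'vocals'])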
def separate(input_path,
             output_path,
             model_name='umxhq',
             targets=('vocals', 'drums', 'bass', 'other'),
             samplerate=44100,
             device='cpu',
             softmask=False,
             residual_model=False,
             alpha=1.0,
             niter=1):
    """ generate 4 subtargets """
    # INPUT: path to the mixture audio file
    # OUTPUT: path to an output folder; the subtargets are written as
    # .wav files into that folder

    # handling an input audio path
    audio, rate = sf.read(
        input_path,
        always_2d=True,
    )

    if audio.shape[1] > 2:
        warnings.warn('Channel count > 2! '
                      'Only the first two channels will be processed!')
        audio = audio[:, :2]

    if rate != samplerate:
        # resample to model samplerate if needed
        audio = resampy.resample(audio, rate, samplerate, axis=0)

    if audio.shape[1] == 1:
        # if we have mono, let's duplicate it
        # as the input of OpenUnmix is always stereo
        audio = np.repeat(audio, 2, axis=1)

    # convert numpy audio to torch
    audio_torch = torch.tensor(audio.T[None, ...]).float().to(device)

    source_names = []
    V = []
    for j, target in enumerate(tqdm.tqdm(targets)):
        unmix_target = load_model(target=target,
                                  model_name=model_name,
                                  device=device)
        Vj = unmix_target(audio_torch).cpu().detach().numpy()
        if softmask:
            # only exponentiate the model if we use softmask
            Vj = Vj**alpha
        # output is nb_frames, nb_samples, nb_channels, nb_bins
        V.append(Vj[:, 0, ...])  # remove sample dim
        source_names += [target]

    V = np.transpose(np.array(V), (1, 3, 2, 0))

    X = unmix_target.stft(audio_torch).detach().cpu().numpy()
    # convert to complex numpy type
    X = X[..., 0] + X[..., 1] * 1j
    X = X[0].transpose(2, 1, 0)

    if residual_model or len(targets) == 1:
        V = norbert.residual_model(V, X, alpha if softmask else 1)
        source_names += (['residual'] if len(targets) > 1
                         else ['accompaniment'])

    Y = norbert.wiener(V, X.astype(np.complex128), niter,
                       use_softmask=softmask)

    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    estimates = {}
    for j, name in enumerate(source_names):
        audio_hat = istft(Y[..., j].T,
                          n_fft=unmix_target.stft.n_fft,
                          n_hopsize=unmix_target.stft.n_hop)
        estimates[name] = audio_hat.T
        # write wav file in output_path
        subtarget_path = output_path.joinpath(name + '.wav')
        sf.write(subtarget_path, estimates[name], samplerate)

    return estimates
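# Hypothetical invocation of the file-based variant above; both paths are
# placeholders, and one .wav file per target ends up in the output folder.
estimates = separate('mixture.wav', 'stems/')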