def forward(self, mix):
    """Separate ``mix`` into sources.

    Returns a tensor of shape (batch, n_sources, audio_channels, time).
    """
    x = mix
    if self.normalize:
        # Statistics come from the mono mixdown so every channel is
        # scaled identically.
        mono = mix.mean(dim=1, keepdim=True)
        mean = mono.mean(dim=-1, keepdim=True)
        std = mono.std(dim=-1, keepdim=True)
    else:
        mean, std = 0, 1
    x = (x - mean) / (1e-5 + std)

    if self.resample:
        # Run the network at twice the input sampling rate.
        x = julius.resample_frac(x, 1, 2)

    skips = []
    for layer in self.encoder:
        x = layer(x)
        skips.append(x)
    if self.lstm:
        x = self.lstm(x)
    for layer in self.decoder:
        # U-Net style skip connection, trimmed to the current length.
        x = x + center_trim(skips.pop(), x)
        x = layer(x)

    if self.resample:
        x = julius.resample_frac(x, 2, 1)
    x = x * std + mean
    return x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1))
def load_track(track, device, audio_channels, samplerate):
    """Load an audio file, trying torchaudio first and ffmpeg as a fallback.

    On success returns a waveform tensor on ``device`` with ``audio_channels``
    channels at ``samplerate``; if every backend fails, prints the collected
    errors and exits the process.
    """
    # Maps backend name -> error message, for reporting if all backends fail.
    errors = {}
    wav = None
    try:
        wav, sr = ta.load(str(track))
    except RuntimeError as err:
        errors['torchaudio'] = err.args[0]
        # torchaudio could not decode the file: fall back to ffmpeg via
        # AudioFile, which resamples and remixes channels itself.
        try:
            wav = AudioFile(track).read(streams=0, samplerate=samplerate, channels=audio_channels).to(device)
        except FileNotFoundError:
            errors['ffmpeg'] = 'Ffmpeg is not installed.'
        except subprocess.CalledProcessError:
            errors['ffmpeg'] = 'FFmpeg could not read the file.'
    else:
        # torchaudio succeeded: adapt channels/device and resample from the
        # file's native rate ``sr`` to the requested ``samplerate``.
        wav = convert_audio_channels(wav, audio_channels)
        wav = wav.to(device)
        wav = julius.resample_frac(wav, sr, samplerate)
    if wav is None:
        # Both backends failed; report every collected error and abort.
        print(f"Could not load file {track}. "
              "Maybe it is not a supported file format? ")
        for backend, error in errors.items():
            print(f"When trying to load using {backend}, got the following error: {error}")
        sys.exit(1)
    return wav
def __getitem__(self, index):
    """Return one training example: a (sources, channels, time) tensor.

    ``index`` addresses the examples of all tracks laid out consecutively;
    it is first mapped to a (track, local index) pair.
    """
    # Walk the tracks until the flat index falls inside one track's
    # example count; the loop returns from inside its body.
    for name, examples in zip(self.metadata, self.num_examples):
        if index >= examples:
            index -= examples
            continue
        meta = self.metadata[name]
        # num_frames == -1 means "read the whole file" for ta.load.
        num_frames = -1
        offset = 0
        if self.segment is not None:
            # Local example `index` starts `shift * index` seconds into
            # the track, expressed in the file's native sample rate.
            offset = int(meta['samplerate'] * self.shift * index)
            num_frames = int(math.ceil(meta['samplerate'] * self.segment))
        wavs = []
        for source in self.sources:
            file = self.get_file(name, source)
            wav, _ = ta.load(str(file), frame_offset=offset, num_frames=num_frames)
            wav = convert_audio_channels(wav, self.channels)
            wavs.append(wav)
        example = th.stack(wavs)
        example = julius.resample_frac(example, meta['samplerate'], self.samplerate)
        if self.normalize:
            # Per-track statistics were precomputed into the metadata.
            example = (example - meta['mean']) / meta['std']
        if self.segment:
            # Trim/pad to exactly `segment` seconds at the target rate
            # (resampling may leave the length off by a few samples).
            length = int(self.segment * self.samplerate)
            example = example[..., :length]
            example = F.pad(example, (0, length - example.shape[-1]))
        return example
def preprocess_and_normalize_audio(wav, current_samplerate, audio_channels, samplerate):
    """Adapt channels, resample to ``samplerate`` and standardize.

    Returns the normalized waveform together with the mono reference signal
    whose mean/std were used, so the caller can undo the normalization.
    """
    converted = convert_audio_channels(wav, audio_channels)
    resampled = julius.resample_frac(converted, current_samplerate, samplerate)
    ref = resampled.mean(0)
    normalized = (resampled - ref.mean()) / ref.std()
    return normalized, ref
def preprocess_audio_batch(audio, sr, center=True, hop_size=0.1):
    """Slice a waveform into overlapping one-second frames.

    Parameters
    ----------
    audio: 1-D tensor of samples, or 2-D which is averaged over dim 1 to
        mono.  # assumes (samples, channels) layout — TODO confirm
    sr: input sample rate; resampled to TARGET_SR when different.
    center: if True, center the audio before framing (via ``center_audio``).
    hop_size: hop between consecutive frames, in seconds.

    Returns
    -------
    Tensor of shape (n_frames, 1, TARGET_SR).
    """
    if audio.dim() == 2:
        # Collapse channels to mono (torch-idiomatic `dim=`, not `axis=`).
        audio = torch.mean(audio, dim=1)
    if sr != TARGET_SR:
        audio = julius.resample_frac(audio, sr, TARGET_SR)

    frame_len = TARGET_SR  # one-second frames
    hop_len = int(hop_size * TARGET_SR)
    if center:
        audio = center_audio(audio, frame_len)
    audio = pad_audio(audio, frame_len, hop_len)

    n_frames = 1 + int((len(audio) - frame_len) / float(hop_len))
    # Zero-copy sliding window: column i is the frame starting at i * hop_len.
    x = torch.as_strided(
        audio,
        size=(frame_len, n_frames),
        stride=(1, hop_len),
    )
    x = torch.transpose(x, 0, 1)  # -> (n_frames, frame_len)
    x = x.unsqueeze(1)            # -> (n_frames, 1, frame_len)
    return x
def test_same_as_downsample(self):
    """Integer downsampling with resample_frac must match an equivalent
    strided low-pass filter."""
    rolloff = 0.945
    for _ in range(10):
        x = th.randn(2 * 3 * 4 * 100)
        for old_sr in [2, 3, 4]:
            resampled = resample_frac(x, old_sr, 1, rolloff=rolloff, zeros=16)
            lowpassed = lowpass_filter(x, rolloff / old_sr / 2, stride=old_sr, zeros=16)
            self.assertSimilar(resampled, lowpassed, x, f"old_sr={old_sr}")
def forward(self, mix):
    """Separate ``mix`` into sources (no input normalization).

    Output shape: (batch, self.sources, self.audio_channels, time).
    NOTE(review): ``self.sources`` is used directly as a dimension size
    here (presumably an int), unlike sibling variants that use
    ``len(self.sources)`` — confirm which convention this model follows.
    """
    x = mix
    if self.resample:
        # Run the network at twice the input sampling rate.
        x = julius.resample_frac(x, 1, 2)
    skips = []
    for layer in self.encoder:
        x = layer(x)
        skips.append(x)
    if self.lstm:
        x = self.lstm(x)
    for layer in self.decoder:
        # U-Net style skip connection, trimmed to the current length.
        x = x + center_trim(skips.pop(), x)
        x = layer(x)
    if self.resample:
        x = julius.resample_frac(x, 2, 1)
    return x.view(x.size(0), self.sources, self.audio_channels, x.size(-1))
def forward(self, mix):
    """Separate ``mix`` into sources.

    The input is padded up to the nearest length the convolution stack can
    process and trimmed back afterwards, so the output time length equals
    the input's. Returns (batch, n_sources, audio_channels, time).
    """
    x = mix
    length = x.shape[-1]

    if self.normalize:
        # Normalize with mono statistics so all channels share one scale.
        mono = mix.mean(dim=1, keepdim=True)
        mean = mono.mean(dim=-1, keepdim=True)
        std = mono.std(dim=-1, keepdim=True)
        x = (x - mean) / (1e-5 + std)
    else:
        mean, std = 0, 1

    # Symmetric padding to the nearest valid length.
    delta = self.valid_length(length) - length
    x = F.pad(x, (delta // 2, delta - delta // 2))

    if self.resample:
        # Run the network at twice the input sampling rate.
        x = julius.resample_frac(x, 1, 2)

    skips = []
    for layer in self.encoder:
        x = layer(x)
        skips.append(x)
    if self.lstm:
        x = self.lstm(x)
    for layer in self.decoder:
        # U-Net style skip connection, trimmed to the current length.
        skip = center_trim(skips.pop(), x)
        x = layer(x + skip)

    if self.resample:
        x = julius.resample_frac(x, 2, 1)
    x = x * std + mean
    x = center_trim(x, length)
    return x.view(x.size(0), len(self.sources), self.audio_channels, x.size(-1))
def preprocess_audio_batch(audio, sr, center=True, hop_size=0.1, sampler="julian"):
    """Frame a batch of waveforms into overlapping one-second windows.

    ``audio`` is (batch, samples) or (batch, samples, channels); a channel
    dimension is averaged to mono. Frames are stacked along the batch
    dimension, giving (batch * n_frames, 1, TARGET_SR).
    """
    if audio.ndim == 3:
        # Mix the channel dimension down to mono.
        audio = torch.mean(audio, axis=2)

    if sr != TARGET_SR:
        if sampler == "julian":
            audio = julius.resample_frac(audio, sr, TARGET_SR)
        elif sampler == "resampy":
            # Round-trip through numpy for resampy, restoring dtype/device.
            resampled = resampy.resample(
                audio.detach().cpu().numpy(),
                sr_orig=sr,
                sr_new=TARGET_SR,
                filter="kaiser_best",
            )
            audio = torch.tensor(resampled, dtype=audio.dtype, device=audio.device)
        else:
            raise ValueError("Only julian and resampy works!")

    frame_len = TARGET_SR  # one-second frames
    hop_len = int(hop_size * TARGET_SR)
    if center:
        audio = center_audio(audio, frame_len)
    audio = pad_audio(audio, frame_len, hop_len)

    n_frames = 1 + int((audio.size()[1] - frame_len) / float(hop_len))
    frames_per_item = []
    expected_shape = None
    for idx in range(audio.shape[0]):
        # Zero-copy sliding window over item `idx`, then (n_frames, 1, frame_len).
        frames = (
            torch.as_strided(
                audio[idx],
                size=(frame_len, n_frames),
                stride=(1, hop_len),
            )
            .transpose(0, 1)
            .unsqueeze(1)
        )
        # Every item in the batch must produce the same frame layout.
        if expected_shape is None:
            expected_shape = frames.shape
        assert frames.shape == expected_shape
        frames_per_item.append(frames)
    return torch.vstack(frames_per_item)
def test(table, old_sr, new_sr, device="cpu"):
    """Benchmark resample_frac against resampy and print one table row.

    On CPU the row also includes resampy's timing and the mean absolute
    difference between both outputs; on other devices only the julius
    timing is reported (resampy is CPU-only).
    NOTE(review): on CUDA, Chrono may not synchronize the device before
    reading the clock — confirm timings are meaningful there.
    """
    n_samples = 8 * old_sr * int(math.ceil(44_100 / old_sr))
    x = th.randn(16, n_samples, device=device)
    with Chrono() as chrono:
        y = resample_frac(x, old_sr, new_sr, zeros=56)
    dur_julius = int(1000 * chrono.duration)
    if device != "cpu":
        table.line([old_sr, new_sr, dur_julius])
        return
    with Chrono() as chrono:
        y_resampy = th.from_numpy(resampy.resample(x.numpy(), old_sr, new_sr))
    dur_resampy = int(1000 * chrono.duration)
    delta = (y_resampy - y).abs().mean()
    table.line([old_sr, new_sr, dur_julius, dur_resampy, format(delta, ".1%")])
def convert_audio(wav, from_samplerate, to_samplerate, channels):
    """Remix ``wav`` to ``channels`` audio channels and resample it from
    ``from_samplerate`` to ``to_samplerate``."""
    remixed = convert_audio_channels(wav, channels)
    return julius.resample_frac(remixed, from_samplerate, to_samplerate)
def convert_audio(wav, from_samplerate, to_samplerate, channels):
    """Convert audio to the target number of channels and samplerate."""
    wav = convert_audio_channels(wav, channels)
    return julius.resample_frac(wav, from_samplerate, to_samplerate)