Exemplo n.º 1
0
    def forward(self, mix):
        """Run the encoder / LSTM / decoder stack on a mixture.

        The output is reshaped to (batch, num_sources, audio_channels, time),
        where the source count comes from ``len(self.sources)``.
        """
        # Normalization statistics computed on the mono downmix so every
        # channel is scaled identically.
        if self.normalize:
            mono = mix.mean(dim=1, keepdim=True)
            mean = mono.mean(dim=-1, keepdim=True)
            std = mono.std(dim=-1, keepdim=True)
        else:
            mean, std = 0, 1

        out = (mix - mean) / (1e-5 + std)

        # Optional 2x upsampling before the network, undone afterwards.
        if self.resample:
            out = julius.resample_frac(out, 1, 2)

        skips = []
        for layer in self.encoder:
            out = layer(out)
            skips.append(out)
        if self.lstm:
            out = self.lstm(out)
        for layer in self.decoder:
            # U-Net style skip connection, trimmed to the current length.
            out = layer(out + center_trim(skips.pop(), out))

        if self.resample:
            out = julius.resample_frac(out, 2, 1)
        # Undo the input normalization.
        out = out * std + mean
        return out.view(out.size(0), len(self.sources), self.audio_channels,
                        out.size(-1))
Exemplo n.º 2
0
def load_track(track, device, audio_channels, samplerate):
    """Decode ``track`` into a waveform tensor on ``device``.

    torchaudio is tried first; on failure an ffmpeg-based ``AudioFile``
    reader is used as a fallback. If neither backend can read the file,
    the collected per-backend errors are printed and the process exits
    with status 1.
    """
    errors = {}
    wav = None
    try:
        wav, sr = ta.load(str(track))
    except RuntimeError as err:
        # torchaudio could not decode; remember why and fall back to ffmpeg.
        errors['torchaudio'] = err.args[0]
        try:
            wav = AudioFile(track).read(streams=0,
                                        samplerate=samplerate,
                                        channels=audio_channels).to(device)
        except FileNotFoundError:
            errors['ffmpeg'] = 'Ffmpeg is not installed.'
        except subprocess.CalledProcessError:
            errors['ffmpeg'] = 'FFmpeg could not read the file.'
    else:
        # torchaudio succeeded: adapt channels, move to device, resample.
        wav = convert_audio_channels(wav, audio_channels).to(device)
        wav = julius.resample_frac(wav, sr, samplerate)

    if wav is not None:
        return wav

    print(f"Could not load file {track}. "
          "Maybe it is not a supported file format? ")
    for backend, error in errors.items():
        print(
            f"When trying to load using {backend}, got the following error: {error}"
        )
    sys.exit(1)
Exemplo n.º 3
0
    def __getitem__(self, index):
        """Return the stacked source waveforms for the global ``index``.

        ``index`` addresses the concatenation of all tracks: it is walked
        down through ``self.num_examples`` until the owning track is found,
        then loaded (optionally offset/windowed when ``self.segment`` is
        set), channel-converted, resampled, normalized and padded to the
        segment length.

        Raises:
            IndexError: if ``index`` is past the end of the dataset.
        """
        for name, examples in zip(self.metadata, self.num_examples):
            if index >= examples:
                # Not in this track; shift into the next track's range.
                index -= examples
                continue
            meta = self.metadata[name]
            num_frames = -1  # -1: torchaudio reads the whole file
            offset = 0
            if self.segment is not None:
                offset = int(meta['samplerate'] * self.shift * index)
                num_frames = int(math.ceil(meta['samplerate'] * self.segment))
            wavs = []
            for source in self.sources:
                file = self.get_file(name, source)
                wav, _ = ta.load(str(file),
                                 frame_offset=offset,
                                 num_frames=num_frames)
                wav = convert_audio_channels(wav, self.channels)
                wavs.append(wav)

            example = th.stack(wavs)
            example = julius.resample_frac(example, meta['samplerate'],
                                           self.samplerate)
            if self.normalize:
                example = (example - meta['mean']) / meta['std']
            if self.segment:
                # Trim then zero-pad so every example has the same length.
                length = int(self.segment * self.samplerate)
                example = example[..., :length]
                example = F.pad(example, (0, length - example.shape[-1]))
            return example
        # Fix: the original fell through and implicitly returned None for an
        # out-of-range index; a map-style dataset must raise IndexError so
        # iteration terminates correctly.
        raise IndexError(index)
Exemplo n.º 4
0
def preprocess_and_normalize_audio(wav, current_samplerate, audio_channels,
                                   samplerate):
    """Remix, resample and standardize a waveform.

    The mono downmix (mean over the first axis) provides the mean/std used
    for standardization; it is returned alongside the normalized waveform
    so the caller can invert the scaling later.
    """
    converted = convert_audio_channels(wav, audio_channels)
    converted = julius.resample_frac(converted, current_samplerate, samplerate)
    ref = converted.mean(0)
    normalized = (converted - ref.mean()) / ref.std()
    return normalized, ref
Exemplo n.º 5
0
def preprocess_audio_batch(audio, sr, center=True, hop_size=0.1):
    """Slice a waveform into overlapping one-second frames at TARGET_SR.

    Args:
        audio: 1-D waveform, or 2-D (samples, channels) — channels are
            averaged to mono (assumes channels on axis 1; TODO confirm).
        sr: input sample rate; resampled to ``TARGET_SR`` when different.
        center: when True, the signal is centered via ``center_audio``.
        hop_size: hop between frames, in seconds.

    Returns:
        Tensor of shape (n_frames, 1, TARGET_SR).
    """
    if audio.dim() == 2:
        audio = torch.mean(audio, axis=1)

    if sr != TARGET_SR:
        audio = julius.resample_frac(audio, sr, TARGET_SR)

    # Fix: dropped the unused local `audio_len` from the original.
    frame_len = TARGET_SR  # one-second frames at the target rate
    hop_len = int(hop_size * TARGET_SR)

    if center:
        audio = center_audio(audio, frame_len)

    audio = pad_audio(audio, frame_len, hop_len)

    n_frames = 1 + int((len(audio) - frame_len) / float(hop_len))
    # Zero-copy sliding window: x[i, j] = audio[i + j * hop_len]
    # (frames as columns), then transposed to (n_frames, frame_len).
    x = torch.as_strided(
        audio,
        size=(frame_len, n_frames),
        stride=(1, hop_len),
    )
    x = torch.transpose(x, 0, 1)
    x = x.unsqueeze(1)
    return x
Exemplo n.º 6
0
 def test_same_as_downsample(self):
     """Integer downsampling must equal an equivalent strided lowpass."""
     rolloff = 0.945
     for _ in range(10):
         x = th.randn(2 * 3 * 4 * 100)
         for old_sr in [2, 3, 4]:
             resampled = resample_frac(x, old_sr, 1, rolloff=rolloff, zeros=16)
             lowpassed = lowpass_filter(x, rolloff / old_sr / 2, stride=old_sr, zeros=16)
             self.assertSimilar(resampled, lowpassed, x, f"old_sr={old_sr}")
Exemplo n.º 7
0
    def forward(self, mix):
        """Apply the encoder / LSTM / decoder stack to a mixture.

        Returns a tensor shaped (batch, sources, audio_channels, time);
        here ``self.sources`` is used directly as the source count.
        """
        out = mix

        # Optional 2x upsampling before the network, undone afterwards.
        if self.resample:
            out = julius.resample_frac(out, 1, 2)

        skips = []
        for layer in self.encoder:
            out = layer(out)
            skips.append(out)
        if self.lstm:
            out = self.lstm(out)
        for layer in self.decoder:
            # U-Net style skip connection, trimmed to the current length.
            out = layer(out + center_trim(skips.pop(), out))

        if self.resample:
            out = julius.resample_frac(out, 2, 1)
        return out.view(out.size(0), self.sources, self.audio_channels,
                        out.size(-1))
Exemplo n.º 8
0
    def forward(self, mix):
        """Separate a mixture, padding so the output matches the input length.

        Returns a tensor of shape (batch, num_sources, audio_channels, time)
        with ``time`` equal to the input length.
        """
        length = mix.shape[-1]

        # Normalization statistics from the mono downmix.
        if self.normalize:
            mono = mix.mean(dim=1, keepdim=True)
            mean = mono.mean(dim=-1, keepdim=True)
            std = mono.std(dim=-1, keepdim=True)
            out = (mix - mean) / (1e-5 + std)
        else:
            mean, std = 0, 1
            out = mix

        # Pad symmetrically so the conv stack sees a valid length.
        delta = self.valid_length(length) - length
        out = F.pad(out, (delta // 2, delta - delta // 2))

        # Optional 2x upsampling before the network, undone afterwards.
        if self.resample:
            out = julius.resample_frac(out, 1, 2)

        skips = []
        for layer in self.encoder:
            out = layer(out)
            skips.append(out)

        if self.lstm:
            out = self.lstm(out)

        for layer in self.decoder:
            # U-Net style skip connection, trimmed to the current length.
            out = layer(out + center_trim(skips.pop(), out))

        if self.resample:
            out = julius.resample_frac(out, 2, 1)
        # Undo the normalization and the padding.
        out = out * std + mean
        out = center_trim(out, length)
        return out.view(out.size(0), len(self.sources), self.audio_channels,
                        out.size(-1))
Exemplo n.º 9
0
def preprocess_audio_batch(audio, sr, center=True, hop_size=0.1, sampler="julian"):
    """Frame a batch of waveforms into overlapping one-second windows.

    A trailing channel axis (ndim == 3) is averaged away first. When the
    input rate differs from ``TARGET_SR`` the audio is resampled with
    julius ("julian") or resampy. Returns a tensor of shape
    (batch * n_frames, 1, TARGET_SR).
    """
    if audio.ndim == 3:
        # Collapse the trailing channel axis to mono.
        audio = torch.mean(audio, axis=2)

    if sr != TARGET_SR:
        if sampler == "julian":
            audio = julius.resample_frac(audio, sr, TARGET_SR)
        elif sampler == "resampy":
            # resampy works on numpy; round-trip and restore dtype/device.
            resampled = resampy.resample(
                audio.detach().cpu().numpy(),
                sr_orig=sr,
                sr_new=TARGET_SR,
                filter="kaiser_best",
            )
            audio = torch.tensor(resampled, dtype=audio.dtype, device=audio.device)
        else:
            raise ValueError("Only julian and resampy works!")

    frame_len = TARGET_SR  # one-second frames at the target rate
    hop_len = int(hop_size * TARGET_SR)
    if center:
        audio = center_audio(audio, frame_len)

    audio = pad_audio(audio, frame_len, hop_len)
    n_frames = 1 + int((audio.size()[1] - frame_len) / float(hop_len))

    frames = []
    expected_shape = None
    for row in range(audio.shape[0]):
        # Zero-copy sliding window over one batch element, shaped
        # (n_frames, 1, frame_len) after transpose/unsqueeze.
        strided = torch.as_strided(
            audio[row],
            size=(frame_len, n_frames),
            stride=(1, hop_len),
        ).transpose(0, 1).unsqueeze(1)
        if expected_shape is None:
            expected_shape = strided.shape
        # Every batch element must produce the same framing.
        assert strided.shape == expected_shape
        frames.append(strided)
    return torch.vstack(frames)
Exemplo n.º 10
0
def test(table, old_sr, new_sr, device="cpu"):
    """Benchmark julius against resampy for one (old_sr, new_sr) pair.

    Appends a row to ``table``. On CPU the row also includes resampy's
    timing and the mean absolute difference between the two outputs; on
    other devices only the julius timing is recorded.
    """
    # Roughly 8 seconds of 44.1 kHz audio per batch element.
    num_samples = 8 * old_sr * int(math.ceil(44_100 / old_sr))
    x = th.randn(16, num_samples, device=device)

    with Chrono() as chrono:
        y = resample_frac(x, old_sr, new_sr, zeros=56)
    dur_julius = int(1000 * chrono.duration)

    if device != "cpu":
        table.line([old_sr, new_sr, dur_julius])
        return

    with Chrono() as chrono:
        y_resampy = th.from_numpy(resampy.resample(x.numpy(), old_sr, new_sr))
    dur_resampy = int(1000 * chrono.duration)

    delta = (y_resampy - y).abs().mean()
    table.line([old_sr, new_sr, dur_julius, dur_resampy, format(delta, ".1%")])
Exemplo n.º 11
0
def convert_audio(wav, from_samplerate, to_samplerate, channels):
    """Convert audio from a given samplerate to a target one and target number of channels."""
    # Fix: the original defined this function twice back to back with
    # identical bodies; the redefinition shadowed the documented copy and
    # has been removed.
    wav = convert_audio_channels(wav, channels)
    return julius.resample_frac(wav, from_samplerate, to_samplerate)