예제 #1
0
    def get_model_input(cls, task, audio: Union[str, torch.Tensor]):
        """Build the single-utterance input dict consumed by the model.

        Args:
            task: Object whose ``task.data_cfg.hub`` mapping is queried for
                the ``"input_type"`` setting.
            audio: Either a path to an audio file (``str``) or an
                already-loaded tensor.

        Returns:
            A dict with a ``"net_input"`` entry (``src_tokens``,
            ``src_lengths`` and ``prev_output_tokens=None``) plus
            ``"target_lengths"`` and ``"speaker"``, both ``None``.

        Raises:
            ValueError: If the input type is not ``"fbank80_w_utt_cmvn"``,
                ``"waveform"`` or ``"standardized_waveform"``. The fallback
                value ``"fbank80"`` is not handled by any branch, so a hub
                config without ``"input_type"`` raises as well.
        """
        input_type = task.data_cfg.hub.get("input_type", "fbank80")
        if input_type == "fbank80_w_utt_cmvn":
            if isinstance(audio, str):
                feat = utt_cmvn.UtteranceCMVN()(get_fbank(audio))
            else:
                # NOTE(review): utterance-level CMVN is not applied on the
                # tensor path even though the input type asks for it; confirm
                # whether that is intended.
                feat = kaldi.fbank(audio, num_mel_bins=80).numpy()
            # ``feat`` must stay a NumPy array because it is handed to
            # ``torch.from_numpy`` below, so the batch axis is added with
            # indexing rather than the tensor-only ``unsqueeze``. Only 2-D
            # (T x D) features get the extra axis, so both paths end up as
            # 1 x T x D and ``shape[1]`` is the number of frames.
            if feat.ndim == 2:
                feat = feat[None, ...]  # T x D -> 1 x T x D
        elif input_type in {"waveform", "standardized_waveform"}:
            if isinstance(audio, str):
                feat, sr = get_wav(audio)  # C x T
                feat, _ = convert_wav(
                    feat, sr, to_sample_rate=16_000, to_mono=True
                )  # C x T -> 1 x T
            else:
                feat = audio.numpy()
        else:
            raise ValueError(f"Unknown value: input_type = {input_type}")

        # Build the length directly as int64 instead of round-tripping
        # through a float tensor.
        src_lengths = torch.tensor([feat.shape[1]], dtype=torch.long)
        src_tokens = torch.from_numpy(feat)  # 1 x T (x D)
        if input_type == "standardized_waveform":
            # Normalise over every dimension of the (single-utterance) input.
            with torch.no_grad():
                src_tokens = F.layer_norm(src_tokens, src_tokens.shape)

        return {
            "net_input": {
                "src_tokens": src_tokens,
                "src_lengths": src_lengths,
                "prev_output_tokens": None,
            },
            "target_lengths": None,
            "speaker": None,
        }
예제 #2
0
def get_features_or_waveform_from_uncompressed_zip(
    path, byte_offset, byte_size, need_waveform=False
):
    """Decode one entry of a ``.zip`` archive into features or a waveform.

    The entry's bytes are obtained from
    ``read_from_uncompressed_zip(path, byte_offset, byte_size)``.

    Args:
        path: Archive path; must end with ``".zip"``.
        byte_offset: Forwarded to ``read_from_uncompressed_zip``.
        byte_size: Forwarded to ``read_from_uncompressed_zip``.
        need_waveform: For audio payloads, return the first element of
            ``get_waveform``'s result instead of ``get_fbank`` output.

    Returns:
        The ``np.load`` result for payloads recognised by ``is_npy_data``;
        otherwise the waveform or filterbank output for payloads recognised
        by ``is_flac_or_wav_data``.

    Raises:
        ValueError: If the payload matches neither recognised format.
    """
    assert path.endswith(".zip")
    payload = read_from_uncompressed_zip(path, byte_offset, byte_size)
    buffer = io.BytesIO(payload)

    if is_npy_data(payload):
        return np.load(buffer)
    if not is_flac_or_wav_data(payload):
        raise ValueError(f'Unknown file format for "{path}"')
    if need_waveform:
        return get_waveform(buffer)[0]
    return get_fbank(buffer)
예제 #3
0
def get_features_or_waveform_from_stored_zip(
    path,
    byte_offset,
    byte_size,
    need_waveform=False,
    use_sample_rate=None,
):
    """Decode one entry of a ``.zip`` archive into features or a waveform.

    The entry's bytes are obtained from
    ``read_from_stored_zip(path, byte_offset, byte_size)``.

    Args:
        path: Archive path; must end with ``".zip"``.
        byte_offset: Forwarded to ``read_from_stored_zip``.
        byte_size: Forwarded to ``read_from_stored_zip``.
        need_waveform: For audio payloads, return the first element of
            ``get_waveform``'s result instead of ``get_fbank`` output.
        use_sample_rate: Passed to ``get_waveform`` as
            ``output_sample_rate``; only used when ``need_waveform`` is true.

    Returns:
        The ``np.load`` result for payloads recognised by ``is_npy_data``;
        otherwise the waveform or filterbank output for payloads recognised
        by ``is_sf_audio_data``.

    Raises:
        ValueError: If the payload matches neither recognised format.
    """
    assert path.endswith(".zip")
    payload = read_from_stored_zip(path, byte_offset, byte_size)
    buffer = io.BytesIO(payload)

    if is_npy_data(payload):
        return np.load(buffer)
    if not is_sf_audio_data(payload):
        raise ValueError(f'Unknown file format for "{path}"')
    if not need_waveform:
        return get_fbank(buffer)
    decoded = get_waveform(
        buffer, always_2d=False, output_sample_rate=use_sample_rate
    )
    return decoded[0]
def get_features_from_npy_or_audio(path):
    """Load features from a ``.npy`` file or compute them from an audio file.

    Args:
        path: File path whose extension selects the loader.

    Returns:
        ``np.load(path)`` for ``.npy`` files, ``get_fbank(path)`` for
        ``.flac`` and ``.wav`` files.

    Raises:
        ValueError: If the extension is none of ``.npy``, ``.flac``, ``.wav``.
    """
    _, suffix = op.splitext(op.basename(path))
    if suffix == ".npy":
        return np.load(path)
    if suffix in {".flac", ".wav"}:
        return get_fbank(path)
    raise ValueError(f'Unsupported file format for "{path}"')
예제 #5
0
def get_features_from_npy_or_audio(path):
    """Load features from a ``.npy`` file or compute them from an audio file.

    Args:
        path: File path whose suffix selects the loader.

    Returns:
        ``np.load(path)`` when the suffix is ``.npy``, otherwise
        ``get_fbank(path)``.

    Raises:
        ValueError: If the suffix is not in
            ``FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS``.
    """
    suffix = Path(path).suffix
    if suffix in FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS:
        # Pick the loader first, then apply it to the untouched path.
        loader = np.load if suffix == ".npy" else get_fbank
        return loader(path)
    raise ValueError(f'Unsupported file format for "{path}"')
예제 #6
0
def get_features_from_npy_or_audio(path):
    """Load features from a ``.npy`` file or compute them from an audio file.

    Args:
        path: File path whose extension selects the loader.

    Returns:
        ``np.load(path)`` when the extension is ``.npy``, otherwise
        ``get_fbank(path)``.

    Raises:
        ValueError: If the extension is not in
            ``FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS``.
    """
    file_name = op.basename(path)
    _, suffix = op.splitext(file_name)
    if suffix not in FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS:
        raise ValueError(f'Unsupported file format for "{path}"')
    if suffix == ".npy":
        return np.load(path)
    return get_fbank(path)