示例#1
0
    def __getitem__(self, index):
        """Load item ``index`` as a float tensor, with optional augmentation.

        The file name is joined onto ``self.root_dir``. When ``parse_path``
        yields a two-element slice pointer, the audio bytes are pulled out of
        the stored zip and wrapped in an in-memory buffer; otherwise the
        joined path itself is handed to the reader.

        Noise/RIR mixing, speed perturbation and volume perturbation are each
        applied only when their random draw falls below the matching
        probability and ``self.is_training`` is truthy. When ``self.is_save``
        is set, the resulting tensor is also written out via
        ``self.save_to_wav``.

        Returns:
            dict: ``{"id": index, "source": feats}``.
        """
        import soundfile as sf

        source = os.path.join(self.root_dir, str(self.fnames[index]))
        _path, slice_ptr = parse_path(source)
        if len(slice_ptr) == 2:
            raw = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
            assert is_sf_audio_data(raw)
            source = io.BytesIO(raw)

        # In each check below the random draw is evaluated before
        # ``self.is_training``, so one draw is consumed per check even when
        # not training.
        mix_noise = random.random() < self.noise_rir_prob and self.is_training
        if mix_noise:
            wav = self.noise_rir_dataset.add_noise_rir(source)
            rate = self.sample_rate
        else:
            wav, rate = sf.read(source, dtype="float32")

        feats = self.postprocess(torch.from_numpy(wav).float(), rate)

        change_speed = (random.random() < self.speed_perturb_prob
                        and self.is_training)
        if change_speed:
            feats = self.sp(feats)

        change_volume = (random.random() < self.volume_perturb_prob
                         and self.is_training)
        if change_volume:
            feats = volume_perturb(feats)

        if self.is_save:
            # Output name: last '/'-separated component of the path, cut at
            # its first '.', plus a fixed suffix.
            out_dir = self.is_save_path
            stem = _path.rpartition('/')[2].partition('.')[0]
            save_path = os.path.join(out_dir, stem) + '_augtment.wav'
            self.save_to_wav(feats, save_path)

        return {"id": index, "source": feats}
def get_features_or_waveform(path: str,
                             need_waveform=False,
                             use_sample_rate=None):
    """Return speech features or a waveform for ``path``.

    ``path`` is either a plain file path (.npy/.wav/.flac) or a reference
    into an uncompressed ZIP file written as
    "<zip path>:<byte offset>:<byte length>".

    Args:
        path (str): plain file path, or "<zip path>:<byte offset>:<byte length>".
        need_waveform (bool): return the waveform instead of features.
        use_sample_rate (int): sample rate to convert the input wave file to.
            It is not used for a plain path when ``need_waveform`` is False.

    Returns:
        features_or_waveform (numpy.ndarray): speech features or waveform.

    Raises:
        ValueError: if the parsed slice pointer has neither 0 nor 2 elements.
    """
    _path, slice_ptr = parse_path(path)
    n_ptr = len(slice_ptr)

    # Byte range inside a stored ZIP archive.
    if n_ptr == 2:
        return get_features_or_waveform_from_stored_zip(
            _path,
            slice_ptr[0],
            slice_ptr[1],
            need_waveform=need_waveform,
            use_sample_rate=use_sample_rate,
        )

    if n_ptr != 0:
        raise ValueError(f"Invalid path: {path}")

    # Plain file on disk.
    if not need_waveform:
        return get_features_from_npy_or_audio(_path)

    loaded = get_waveform(_path,
                          always_2d=False,
                          output_sample_rate=use_sample_rate)
    return loaded[0]
示例#3
0
def get_feature_value_min_max(feature_paths: List[str]):
    """Scan zip-stored ``.npy`` features and return their ``(min, max)``.

    Every entry of ``feature_paths`` must parse into a two-element slice
    pointer; the referenced bytes are read from the stored zip, checked with
    ``is_npy_data`` and loaded with ``np.load``.

    NOTE(review): the running extremes start at ``1e-8`` / ``-1e-8`` rather
    than +/-infinity, so the returned minimum is never above ``1e-8`` and the
    returned maximum never below ``-1e-8`` -- confirm this is intended.

    Args:
        feature_paths: paths to the stored features.

    Returns:
        tuple: ``(v_min, v_max)`` over all loaded feature arrays.
    """
    lowest, highest = 1e-8, -1e-8
    for entry in tqdm(feature_paths):
        archive, span = parse_path(entry)
        assert len(span) == 2
        payload = read_from_stored_zip(archive, span[0], span[1])
        assert is_npy_data(payload)
        features = np.load(io.BytesIO(payload)).squeeze()
        lowest = min(lowest, features.min().item())
        highest = max(highest, features.max().item())
    return lowest, highest
示例#4
0
    def __getitem__(self, index):
        """Load item ``index`` and return it as a post-processed float tensor.

        The file name is joined onto ``self.root_dir``. When ``parse_path``
        yields a two-element slice pointer, the audio bytes are read from the
        stored zip and decoded from memory; otherwise the joined path is
        passed to ``sf.read`` as is.

        Returns:
            dict: ``{"id": index, "source": feats}``.
        """
        import soundfile as sf

        audio_src = os.path.join(self.root_dir, str(self.fnames[index]))
        _path, slice_ptr = parse_path(audio_src)
        if len(slice_ptr) == 2:
            raw = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
            assert is_sf_audio_data(raw)
            audio_src = io.BytesIO(raw)

        samples, rate = sf.read(audio_src, dtype="float32")
        tensor = torch.from_numpy(samples).float()
        return {"id": index, "source": self.postprocess(tensor, rate)}
示例#5
0
    def get_audio(self, index):
        """Read the audio for ``index`` and return the post-processed tensor.

        The name is joined onto ``self.audio_root``. A path without a slice
        pointer is read straight from disk; any other path must point into a
        ``.zip`` file, whose byte range is read and decoded from memory.
        """
        import soundfile as sf

        full_path = os.path.join(self.audio_root, self.audio_names[index])
        _path, slice_ptr = parse_path(full_path)

        if len(slice_ptr) != 0:
            assert _path.endswith(".zip")
            blob = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
            source = io.BytesIO(blob)
        else:
            source = _path

        samples, rate = sf.read(source)
        return self.postprocess(torch.from_numpy(samples).float(), rate)
示例#6
0
def load_dataset_raw_to_waveforms(
    file_name,
    dataset_size=None,
    need_waveform=True,
    sample_rate=16000,
    read_using_soundfile=False,
):
    """Load a raw dataset from a w2v tsv file, optionally reading waveforms.

    The first line of the manifest is the root directory. For each following
    line, the first tab-separated field is joined onto that root. Blank lines
    (for example a trailing empty line) are skipped instead of producing an
    entry that points at the root directory itself.

    Args:
        file_name: path of the tsv manifest.
        dataset_size: if truthy, keep only the first ``dataset_size`` entries.
        need_waveform: when False, return the joined paths without reading
            any audio.
        sample_rate: forwarded to ``get_waveform`` as ``output_sample_rate``;
            not used when ``read_using_soundfile`` is True.
        read_using_soundfile: read each file with ``sf.read`` instead of
            ``get_waveform``.

    Returns:
        list: one of
            * the joined path strings, when ``need_waveform`` is False;
            * float tensors viewed as ``(1, -1)``, when
              ``read_using_soundfile`` is True (2-D reads are averaged over
              the last axis first);
            * dicts ``{"id", "net_input": {"src_tokens", "src_lengths"}}``
              otherwise.

    Raises:
        NotImplementedError: on the ``get_waveform`` path, if an entry parses
            into a non-empty slice pointer.
    """
    with open(file_name, "r") as fp:
        # Header line holds the root directory; resolve it once rather than
        # once per entry.
        root = fp.readline().strip()
        rows = (line.strip() for line in fp)
        data = [os.path.join(root, row.split("\t")[0]) for row in rows if row]

    if dataset_size:
        data = data[:dataset_size]

    if not need_waveform:
        return data

    features = []
    if read_using_soundfile:
        for d in data:
            wav = sf.read(d)[0]
            if wav.ndim == 2:
                # Collapse a 2-D read to 1-D by averaging over the last axis.
                wav = wav.mean(-1)
            features.append(torch.from_numpy(wav).float().view(1, -1))
        return features

    for i, d in enumerate(data):
        _path, slice_ptr = parse_path(d)
        if len(slice_ptr) != 0:
            raise NotImplementedError("Currently unsupported data format")
        feat = get_waveform(_path,
                            always_2d=True,
                            output_sample_rate=sample_rate)[0]
        features.append({
            "id": i,
            "net_input": {
                "src_tokens": torch.tensor(feat),
                "src_lengths": torch.tensor([feat.shape[1]]),
            },
        })
    return features