def __getitem__(self, index):
    import soundfile as sf

    path_or_fp = os.path.join(self.root_dir, str(self.fnames[index]))
    _path, slice_ptr = parse_path(path_or_fp)
    if len(slice_ptr) == 2:
        # Entry stored inside an uncompressed ZIP: read the raw byte slice and
        # hand it to soundfile via an in-memory buffer.
        byte_data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
        assert is_sf_audio_data(byte_data)
        path_or_fp = io.BytesIO(byte_data)

    # Additive noise / room impulse response augmentation (training only).
    if random.random() < self.noise_rir_prob and self.is_training:
        wav = self.noise_rir_dataset.add_noise_rir(path_or_fp)
        curr_sample_rate = self.sample_rate
    else:
        wav, curr_sample_rate = sf.read(path_or_fp, dtype="float32")
    feats = torch.from_numpy(wav).float()
    feats = self.postprocess(feats, curr_sample_rate)

    # Speed and volume perturbation (training only).
    if random.random() < self.speed_perturb_prob and self.is_training:
        feats = self.sp(feats)
    if random.random() < self.volume_perturb_prob and self.is_training:
        feats = volume_perturb(feats)

    if self.is_save:
        save_path = os.path.join(
            self.is_save_path, _path.split('/')[-1].split('.')[0]
        ) + '_augment.wav'
        self.save_to_wav(feats, save_path)
    return {"id": index, "source": feats}
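# The dataset above relies on augmentation helpers that are not shown here
# (`self.sp` for speed perturbation and `volume_perturb`). The sketch below is a
# hypothetical random-gain volume perturbation, for illustration only; the actual
# helper used by the dataset may differ.
import torch


def volume_perturb_sketch(feats: torch.Tensor, low_db: float = -10.0, high_db: float = 10.0) -> torch.Tensor:
    """Scale a waveform tensor by a gain drawn uniformly in dB (illustrative sketch)."""
    gain_db = torch.empty(1).uniform_(low_db, high_db).item()
    return feats * (10.0 ** (gain_db / 20.0))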
def get_features_or_waveform(path: str, need_waveform=False, use_sample_rate=None):
    """Get speech features from .npy file or waveform from .wav/.flac file.
    The file may be inside an uncompressed ZIP file and is accessed via byte
    offset and length.

    Args:
        path (str): File path in the format of "<.npy/.wav/.flac path>" or
            "<zip path>:<byte offset>:<byte length>".
        need_waveform (bool): return waveform instead of features.
        use_sample_rate (int): change sample rate for the input wave file

    Returns:
        features_or_waveform (numpy.ndarray): speech features or waveform.
    """
    _path, slice_ptr = parse_path(path)
    if len(slice_ptr) == 0:
        if need_waveform:
            return get_waveform(
                _path, always_2d=False, output_sample_rate=use_sample_rate
            )[0]
        return get_features_from_npy_or_audio(_path)
    elif len(slice_ptr) == 2:
        features_or_waveform = get_features_or_waveform_from_stored_zip(
            _path,
            slice_ptr[0],
            slice_ptr[1],
            need_waveform=need_waveform,
            use_sample_rate=use_sample_rate,
        )
    else:
        raise ValueError(f"Invalid path: {path}")
    return features_or_waveform
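# `parse_path` splits the two accepted path formats: a plain file path yields an
# empty slice pointer, while "<zip path>:<byte offset>:<byte length>" yields the
# offset and length of the stored ZIP entry. A simplified sketch of that
# convention (not the library implementation; example paths and offsets are made up):
import re
from typing import List, Tuple


def parse_path_sketch(path: str) -> Tuple[str, List[int]]:
    m = re.match(r"^(.+):(\d+):(\d+)$", path)
    if m is None:
        return path, []  # plain .npy/.wav/.flac path
    return m.group(1), [int(m.group(2)), int(m.group(3))]


assert parse_path_sketch("utt1.flac") == ("utt1.flac", [])
assert parse_path_sketch("fbank80.zip:1024:28672") == ("fbank80.zip", [1024, 28672])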
def get_feature_value_min_max(feature_paths: List[str]):
    v_min, v_max = 1e-8, -1e-8
    for p in tqdm(feature_paths):
        _path, slice_ptr = parse_path(p)
        assert len(slice_ptr) == 2
        byte_data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
        assert is_npy_data(byte_data)
        path_or_fp = io.BytesIO(byte_data)
        features = np.load(path_or_fp).squeeze()
        v_min = min(v_min, features.min().item())
        v_max = max(v_max, features.max().item())
    return v_min, v_max
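# `np.load` accepts a file-like object, which is why the raw ZIP byte slice can be
# wrapped in io.BytesIO above. A minimal, self-contained round trip illustrating the
# same pattern (toy data, not a real feature file):
import io

import numpy as np

buf = io.BytesIO()
np.save(buf, np.array([[0.5, -1.2], [3.4, 0.0]], dtype=np.float32))
buf.seek(0)
features = np.load(buf).squeeze()
print(features.min().item(), features.max().item())  # ~ -1.2 and ~ 3.4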
def __getitem__(self, index):
    import soundfile as sf

    path_or_fp = os.path.join(self.root_dir, str(self.fnames[index]))
    _path, slice_ptr = parse_path(path_or_fp)
    if len(slice_ptr) == 2:
        byte_data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
        assert is_sf_audio_data(byte_data)
        path_or_fp = io.BytesIO(byte_data)

    wav, curr_sample_rate = sf.read(path_or_fp, dtype="float32")
    feats = torch.from_numpy(wav).float()
    feats = self.postprocess(feats, curr_sample_rate)
    return {"id": index, "source": feats}
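# `sf.read` likewise accepts file-like objects, so wrapping the ZIP byte slice in
# io.BytesIO is enough to decode it. A self-contained illustration with a synthetic
# tone written to an in-memory WAV (toy data only):
import io

import numpy as np
import soundfile as sf

sr = 16000
tone = 0.1 * np.sin(2 * np.pi * 440.0 * np.arange(sr, dtype=np.float32) / sr)
buf = io.BytesIO()
sf.write(buf, tone, sr, format="WAV", subtype="FLOAT")
buf.seek(0)
wav, curr_sample_rate = sf.read(buf, dtype="float32")
assert curr_sample_rate == sr and wav.shape == tone.shape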
def get_audio(self, index):
    import soundfile as sf

    wav_path = os.path.join(self.audio_root, self.audio_names[index])
    _path, slice_ptr = parse_path(wav_path)
    if len(slice_ptr) == 0:
        wav, cur_sample_rate = sf.read(_path)
    else:
        assert _path.endswith(".zip")
        data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
        f = io.BytesIO(data)
        wav, cur_sample_rate = sf.read(f)
    wav = torch.from_numpy(wav).float()
    wav = self.postprocess(wav, cur_sample_rate)
    return wav
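# `read_from_stored_zip` returns `length` bytes starting at `offset` inside an
# uncompressed (ZIP_STORED) archive, i.e. the raw bytes of one member. A
# self-contained sketch showing where such offsets come from, by parsing the
# 30-byte local file header of a toy archive (illustration only):
import struct
import zipfile

with zipfile.ZipFile("toy.zip", "w", compression=zipfile.ZIP_STORED) as zf:
    zf.writestr("utt1.bin", b"\x00\x01\x02\x03")

with zipfile.ZipFile("toy.zip") as zf:
    info = zf.getinfo("utt1.bin")

with open("toy.zip", "rb") as f:
    f.seek(info.header_offset)
    header = f.read(30)
    name_len, extra_len = struct.unpack("<HH", header[26:30])
    offset = info.header_offset + 30 + name_len + extra_len
    f.seek(offset)
    assert f.read(info.file_size) == b"\x00\x01\x02\x03"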
def load_dataset_raw_to_waveforms(
    file_name,
    dataset_size=None,
    need_waveform=True,
    sample_rate=16000,
    read_using_soundfile=False,
):
    """Load raw dataset from w2v tsv file. Optionally get waveforms"""
    data = []
    with open(file_name, "r") as fp:
        lines = fp.readlines()
        data = [
            os.path.join(lines[0].strip(), line.strip().split("\t")[0])
            for line in lines[1:]
        ]

    if dataset_size:
        data = data[:dataset_size]

    if not need_waveform:
        return data

    features = []
    if read_using_soundfile:
        for _i, d in enumerate(data):
            wav = sf.read(d)[0]
            if wav.ndim == 2:
                wav = wav.mean(-1)
            features.append(torch.from_numpy(wav).float().view(1, -1))
    else:
        for i, d in enumerate(data):
            _path, slice_ptr = parse_path(d)
            if len(slice_ptr) == 0:
                feat = get_waveform(
                    _path, always_2d=True, output_sample_rate=sample_rate
                )[0]
                features.append(
                    {
                        "id": i,
                        "net_input": {
                            "src_tokens": torch.tensor(feat),
                            "src_lengths": torch.tensor([feat.shape[1]]),
                        },
                    }
                )
            else:
                raise Exception("Currently unsupported data format")
    return features
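# The w2v TSV manifest expected above has the root directory on its first line and
# one "<relative path>\t<n frames>" entry per following line. A hypothetical
# manifest and call (paths and frame count are made up; with need_waveform=False
# only the joined paths are returned, so no audio files are touched):
with open("train_sketch.tsv", "w") as fp:
    fp.write("/data/LibriSpeech\n")
    fp.write("train-clean-100/103/1240/103-1240-0000.flac\t225360\n")

paths = load_dataset_raw_to_waveforms("train_sketch.tsv", need_waveform=False)
assert paths == ["/data/LibriSpeech/train-clean-100/103/1240/103-1240-0000.flac"]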