def get_features_or_waveform(path: str, need_waveform=False):
    """Get speech features from .npy file or waveform from .wav/.flac file.

    The file may be inside an uncompressed ZIP file and is accessed via byte
    offset and length.

    Args:
        path (str): File path in the format of "<.npy/.wav/.flac path>" or
            "<zip path>:<byte offset>:<byte length>".
        need_waveform (bool): return waveform instead of features.

    Returns:
        features_or_waveform (numpy.ndarray): speech features or waveform.

    Raises:
        FileNotFoundError: if the file part of ``path`` does not exist.
        ValueError: if ``path`` does not have exactly one or three
            ":"-separated fields, or the offset/length fields are not
            integers.
    """
    _path, *extra = path.split(":")
    if not op.exists(_path):
        raise FileNotFoundError(f"File not found: {_path}")

    if len(extra) == 0:
        if need_waveform:
            # Keep only the first element of get_waveform's result so that
            # this branch returns the same kind of value as the ZIP branch
            # (which also takes ``[0]``) and matches the documented return.
            return get_waveform(_path)[0]
        return get_features_from_npy_or_audio(_path)

    if len(extra) == 2:
        byte_offset, byte_size = (int(i) for i in extra)
        return get_features_or_waveform_from_uncompressed_zip(
            _path, byte_offset, byte_size, need_waveform=need_waveform
        )

    raise ValueError(f"Invalid path: {path}")
def compute_num_frames_from_feat_or_waveform(rxfile: str) -> int:
    """Return the number of feature frames described by ``rxfile``.

    Three kinds of ``rxfile`` are recognised (surrounding whitespace is
    ignored when classifying it):

    * ``...ark:<offset>`` -- a feature matrix read with ``kaldi_io``; the
      frame count is the first dimension of the matrix.
    * a string ending in ``|`` -- a shell command whose stdout is decoded as
      audio; the frame count is derived from the number of samples.
    * anything else -- a path to a raw waveform file inspected with
      ``soundfile``; the frame count is derived from the number of samples.

    For audio, the count is computed by ``num_samples_to_num_frames`` with
    ``frame_length=25.0`` and ``frame_shift=10.0``.

    Args:
        rxfile (str): feature/audio specifier as described above.

    Returns:
        int: number of frames.

    Raises:
        ImportError: if the optional ``kaldi_io`` / ``soundfile`` dependency
            required for the given specifier is not available.
        Exception: if the feature matrix cannot be read (the original error
            is attached as ``__cause__``).
    """
    spec = rxfile.strip()
    if re.search(r"\.ark:\d+$", spec) is not None:  # from feats.scp
        if not has_kaldi_io:
            raise ImportError(
                "Please install kaldi_io with: pip install kaldi_io")
        try:
            feat = kaldi_io.read_mat(rxfile)
        except Exception as err:
            # Chain the cause so the underlying I/O or parse error is not lost.
            raise Exception(
                "failed to read feature matrix {}.".format(rxfile)) from err
        assert feat is not None and isinstance(feat, np.ndarray)
        num_frames = feat.shape[0]
    elif re.search(r"\|$", spec) is not None:  # from a command
        # Slice the *stripped* specifier: the pattern above was matched on the
        # stripped text, so slicing the raw string would leave the trailing
        # "|" in the command whenever rxfile ends with whitespace or a newline.
        # NOTE(review): the command is executed through the shell; rxfile must
        # come from a trusted source.
        source = BytesIO(run(spec[:-1], shell=True, stdout=PIPE).stdout)
        waveform, sample_rate = get_waveform(source, always_2d=True)
        num_frames = num_samples_to_num_frames(
            waveform.shape[1], sample_rate,
            frame_length=25.0, frame_shift=10.0)
    else:  # from a raw waveform file
        if not has_soundfile:
            raise ImportError(
                "Please install soundfile with: pip install soundfile")
        # The path is passed through unmodified so existing callers see the
        # same file lookup as before.
        info = soundfile.info(rxfile)
        num_frames = num_samples_to_num_frames(
            info.frames, info.samplerate,
            frame_length=25.0, frame_shift=10.0)
    return num_frames
def get_features_or_waveform(path: str, need_waveform=False, use_sample_rate=None):
    """Load speech features or a waveform from a file or a ZIP member.

    A bare path is read directly; a path carrying a byte offset and byte
    length addresses a member stored inside an uncompressed ZIP file.

    Args:
        path (str): File path in the format of "<.npy/.wav/.flac path>" or
            "<zip path>:<byte offset>:<byte length>".
        need_waveform (bool): return waveform instead of features.
        use_sample_rate (int): change sample rate for the input wave file

    Returns:
        features_or_waveform (numpy.ndarray): speech features or waveform.

    Raises:
        ValueError: if the parsed path has neither zero nor two slice fields.
    """
    file_path, byte_range = parse_path(path)
    n_fields = len(byte_range)

    # Plain file on disk: no byte range was attached to the path.
    if n_fields == 0:
        if not need_waveform:
            return get_features_from_npy_or_audio(file_path)
        loaded = get_waveform(
            file_path, always_2d=False, output_sample_rate=use_sample_rate
        )
        return loaded[0]

    if n_fields != 2:
        raise ValueError(f"Invalid path: {path}")

    # Member of a stored ZIP archive, addressed by (offset, length).
    return get_features_or_waveform_from_stored_zip(
        file_path,
        byte_range[0],
        byte_range[1],
        need_waveform=need_waveform,
        use_sample_rate=use_sample_rate,
    )
def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str, str, str]:
    """Return the ``n``-th example with its audio segment loaded.

    The record stored in ``self.data[n]`` supplies the audio path, the start
    offset and the number of frames to read, plus the metadata that is passed
    through unchanged.

    Args:
        n (int): index into ``self.data``.

    Returns:
        tuple: ``(waveform, sample_rate, source_utterance, target_utterance,
        speaker_id, utterance_id)`` where ``waveform`` is a ``torch.Tensor``
        built from the array returned by ``get_waveform``.
    """
    record = self.data[n]
    wav_path, offset, n_frames, sr, src_utt, tgt_utt, spk_id, utt_id = record
    # Only the requested segment is read; the sample rate reported by the
    # loader is ignored in favour of the one stored in the record.
    samples, _loader_sr = get_waveform(wav_path, frames=n_frames, start=offset)
    return torch.from_numpy(samples), sr, src_utt, tgt_utt, spk_id, utt_id
def get_features_or_waveform_from_uncompressed_zip(
    path, byte_offset, byte_size, need_waveform=False
):
    """Read one member of an uncompressed ZIP file and decode it.

    Args:
        path: path to the ZIP file; must end with ".zip".
        byte_offset: offset of the member's bytes, forwarded to
            ``read_from_uncompressed_zip``.
        byte_size: number of bytes to read, forwarded to
            ``read_from_uncompressed_zip``.
        need_waveform (bool): for audio data, return the waveform instead of
            filterbank features. Ignored for .npy data.

    Returns:
        The array loaded from .npy data, or for FLAC/WAV data either the
        first element of ``get_waveform``'s result or the output of
        ``get_fbank``.

    Raises:
        ValueError: if the bytes are neither .npy nor FLAC/WAV data.
    """
    assert path.endswith(".zip")
    payload = read_from_uncompressed_zip(path, byte_offset, byte_size)
    stream = io.BytesIO(payload)

    if is_npy_data(payload):
        return np.load(stream)

    if not is_flac_or_wav_data(payload):
        raise ValueError(f'Unknown file format for "{path}"')

    if need_waveform:
        return get_waveform(stream)[0]
    return get_fbank(stream)
def read_audio(self, path, ref_len=None):
    """Read a waveform stored inside an uncompressed ZIP file.

    Args:
        path: "<zip path>:<byte offset>:<byte length>"; exactly two
            ":"-separated fields must follow the ZIP path.
        ref_len: optional expected length; when given and it differs from
            the length of the returned array by more than 160, a warning is
            logged.

    Returns:
        A 1-D waveform array whose sample rate equals
        ``self.task.cfg.sample_rate``.
    """
    zip_path, *byte_fields = path.split(":")
    assert len(byte_fields) == 2
    assert zip_path.endswith(".zip")

    raw = read_from_uncompressed_zip(
        zip_path, int(byte_fields[0]), int(byte_fields[1])
    )
    wav, sr = get_waveform(io.BytesIO(raw))
    assert sr == self.task.cfg.sample_rate, sr

    # NOTE(review): this collapses the LAST axis. If get_waveform returns
    # 2-D audio as (channels, samples), that averages over samples rather
    # than channels -- confirm the layout get_waveform uses by default.
    if wav.ndim == 2:
        wav = wav.mean(-1)
    assert wav.ndim == 1, wav.ndim

    length_mismatch = ref_len is not None and abs(ref_len - len(wav)) > 160
    if length_mismatch:
        logging.warning(f"ref {ref_len} != read {len(wav)} ({zip_path})")
    return wav
def process(
    line: str, n_bins: int = 80, feature_type: str = "fbank"
) -> Tuple[np.ndarray, np.ndarray, int]:
    """Compute per-utterance feature statistics for one "<id> <source>" line.

    The source is either a path to a waveform file or, when it ends with
    "|", a shell command whose stdout is decoded as audio.

    Args:
        line (str): whitespace-separated "<id> <source>" entry; everything
            after the first field is taken as the source.
        n_bins (int): forwarded to ``get_torchaudio_fbank_or_mfcc``.
        feature_type (str): forwarded to ``get_torchaudio_fbank_or_mfcc``.

    Returns:
        tuple: ``(feature_sum, unnormalized_variance, num_frames)`` where the
        sum and variance are taken over axis 0 of the feature matrix and the
        variance is multiplied by the number of frames.

    Raises:
        ValueError: if ``line`` does not contain a source field.
    """
    _utt_id, rxfile = line.rstrip().split(None, 1)

    reads_from_command = re.search(r"\|$", rxfile) is not None
    if reads_from_command:
        # Run the command (without its trailing "|") and decode its stdout.
        completed = run(rxfile[:-1], shell=True, stdout=PIPE)
        source = BytesIO(completed.stdout)
    else:
        # A raw waveform file path is handed to the loader as is.
        source = rxfile

    waveform, sample_rate = get_waveform(
        source, normalization=False, always_2d=True
    )
    feat = get_torchaudio_fbank_or_mfcc(
        waveform, sample_rate, n_bins=n_bins, feature_type=feature_type
    )

    n_frames = feat.shape[0]
    return feat.sum(axis=0), np.var(feat, axis=0) * n_frames, n_frames
def _get_features(self, i):
    """Load (or compute) the feature matrix for the ``i``-th entry.

    Depending on ``self.input_format`` the features are read with
    ``kaldi_io`` ("feat"), or computed from audio that comes either from a
    shell command's stdout ("command") or from the entry itself (any other
    format). Optional feature transforms and SpecAugment are applied
    afterwards, in that order.

    Args:
        i: index into ``self.rxfiles``.

    Returns:
        The feature matrix after any configured transforms.
    """
    input_format = self.input_format
    rxfile = self.rxfiles[i]

    if input_format == "feat":
        feat = kaldi_io.read_mat(rxfile)
    else:
        if input_format == "command":
            # Drop the trailing character of the entry and run the rest
            # through the shell; its stdout is the audio stream.
            completed = run(rxfile[:-1], shell=True, stdout=PIPE)
            source = BytesIO(completed.stdout)
        else:
            source = rxfile
        waveform, sample_rate = get_waveform(
            source, normalization=False, always_2d=True
        )
        feat = get_torchaudio_fbank_or_mfcc(
            waveform,
            sample_rate,
            n_bins=self.feat_dim,
            feature_type=self.feature_type,
        )

    if self.feature_transforms is not None:
        feat = self.feature_transforms(feat)

    specaug_cfg = self.specaugment_config
    if specaug_cfg is not None and specaug_cfg != "":
        # NOTE(review): the config string is passed to eval(); it must come
        # from a trusted source.
        # Seeding on (seed, epoch, index) ties the augmentation to the item.
        with data_utils.numpy_seed(self.seed, self.epoch, i):
            feat = specaug(feat, **eval(specaug_cfg))
    return feat
def load_dataset_raw_to_waveforms(
    file_name,
    dataset_size=None,
    need_waveform=True,
    sample_rate=16000,
    read_using_soundfile=False,
):
    """Load raw dataset from w2v tsv file. Optionally get waveforms

    The first line of the manifest is the root directory; on every following
    line the first tab-separated field is an audio path relative to it.

    Args:
        file_name: path to the tsv manifest.
        dataset_size: if truthy, keep only the first ``dataset_size`` entries.
        need_waveform (bool): when False, return the joined audio paths
            without reading any audio.
        sample_rate (int): passed to ``get_waveform`` as
            ``output_sample_rate`` (not used on the soundfile path).
        read_using_soundfile (bool): read audio with ``sf.read`` instead of
            ``get_waveform``.

    Returns:
        list: audio paths when ``need_waveform`` is False; float tensors
        viewed as ``(1, -1)`` when ``read_using_soundfile`` is True;
        otherwise dicts with ``"id"`` and ``"net_input"`` entries.

    Raises:
        Exception: if an entry carries slice fields (unsupported here).
    """
    with open(file_name, "r") as manifest:
        root_dir = manifest.readline().strip()
        audio_paths = [
            os.path.join(root_dir, row.strip().split("\t")[0])
            for row in manifest
        ]

    if dataset_size:
        audio_paths = audio_paths[:dataset_size]
    if not need_waveform:
        return audio_paths

    features = []
    if read_using_soundfile:
        for audio_path in audio_paths:
            wav = sf.read(audio_path)[0]
            # Collapse the last axis of 2-D audio before wrapping it.
            if wav.ndim == 2:
                wav = wav.mean(-1)
            features.append(torch.from_numpy(wav).float().view(1, -1))
        return features

    for idx, audio_path in enumerate(audio_paths):
        _path, slice_ptr = parse_path(audio_path)
        if len(slice_ptr) != 0:
            raise Exception("Currently unsupported data format")
        feat = get_waveform(
            _path, always_2d=True, output_sample_rate=sample_rate
        )[0]
        sample = {
            "id": idx,
            "net_input": {
                "src_tokens": torch.tensor(feat),
                "src_lengths": torch.tensor([feat.shape[1]]),
            },
        }
        features.append(sample)
    return features
def get_features_or_waveform_from_stored_zip(
    path,
    byte_offset,
    byte_size,
    need_waveform=False,
    use_sample_rate=None,
):
    """Read one member of a stored ZIP file and decode it.

    Args:
        path: path to the ZIP file; must end with ".zip".
        byte_offset: offset of the member's bytes, forwarded to
            ``read_from_stored_zip``.
        byte_size: number of bytes to read, forwarded to
            ``read_from_stored_zip``.
        need_waveform (bool): for audio data, return the waveform instead of
            filterbank features. Ignored for .npy data.
        use_sample_rate: passed to ``get_waveform`` as
            ``output_sample_rate``; only used when a waveform is requested.

    Returns:
        The array loaded from .npy data, or for audio data either the first
        element of ``get_waveform``'s result or the output of ``get_fbank``.

    Raises:
        ValueError: if the bytes are neither .npy nor recognised audio data.
    """
    assert path.endswith(".zip")
    payload = read_from_stored_zip(path, byte_offset, byte_size)
    stream = io.BytesIO(payload)

    if is_npy_data(payload):
        return np.load(stream)

    if not is_sf_audio_data(payload):
        raise ValueError(f'Unknown file format for "{path}"')

    if not need_waveform:
        return get_fbank(stream)
    decoded = get_waveform(
        stream, always_2d=False, output_sample_rate=use_sample_rate
    )
    return decoded[0]