def check_dependencies(segment_words: Optional[bool] = False):
    """
    Verify that the optional packages needed by the GALE Mandarin recipe are importable.

    :param segment_words: when truthy, the 'jieba' package is required in addition to 'pandas'.
    :raises ImportError: when a required package is not available.
    """
    pandas_missing = not is_module_available('pandas')
    if pandas_missing:
        raise ImportError(
            "GALE Mandarin data preparation requires the 'pandas' package to be installed. "
            "Please install it with 'pip install pandas' and try again.")

    # 'jieba' is only needed when word segmentation was requested.
    if not segment_words:
        return
    if is_module_available('jieba'):
        return
    raise ImportError(
        "The '--segment-words' option requires the 'jieba' package to be installed. "
        "Please install it with 'pip install jieba' and try again.")
def __init__(
    self,
    pattern: str,
    maxcount: int = 100000,
    maxsize: float = 3e9,
    post: Optional[Callable] = None,
    start_shard: int = 0,
    **kw,
):
    """Create a ShardWriter.

    :param pattern: output file pattern
    :param maxcount: maximum number of records per shard (Default value = 100000)
    :param maxsize: maximum size of each shard (Default value = 3e9)
    :param post: optional callable stored as ``self.post``
        (presumably invoked once a shard is finished -- confirm in ``next_stream``).
    :param start_shard: initial value of the shard counter ``self.shard`` (Default value = 0)
    :param kw: other options passed to TarWriter
    :raises ImportError: when the 'webdataset' package is not available.
    """
    if not is_module_available("webdataset"):
        raise ImportError("Please 'pip install webdataset' first.")

    # User-provided configuration.
    self.pattern = pattern
    self.maxcount = maxcount
    self.maxsize = maxsize
    self.post = post
    self.kw = kw
    self.verbose = 1

    # Book-keeping state; the shard counter starts at ``start_shard``.
    self.shard = start_shard
    self.total = 0
    self.count = 0
    self.size = 0
    self.tarstream = None
    self.fname = None

    # Has to run last: all the attributes above must exist before the first stream is opened.
    self.next_stream()
def __init__(
    self,
    path_or_url: Pathlike,
    shard_size: Optional[int] = None,
    audio_format: str = "flac",
    load_audio: bool = True,
    load_features: bool = True,
    load_custom: bool = True,
    fault_tolerant: bool = True,
) -> None:
    """
    Store the writer options and prepare a factory for the underlying tar writer.

    No file is opened here: ``self.writer`` starts as ``None`` and only
    ``self.writer_init_fn`` (a ``functools.partial``) is set up.

    :param path_or_url: destination passed as the first argument to the writer factory.
    :param shard_size: when given (must be positive), a ``ShardWriter`` with
        ``maxcount=shard_size`` is used; otherwise a plain webdataset ``TarWriter``.
    :param audio_format: stored as-is on the instance.
    :param load_audio: stored as-is on the instance.
    :param load_features: stored as-is on the instance.
    :param load_custom: stored as-is on the instance.
    :param fault_tolerant: stored as-is on the instance.
    :raises ImportError: when the 'webdataset' package is not available.
    """
    if not is_module_available("webdataset"):
        raise ImportError("Please 'pip install webdataset' first.")
    from webdataset import TarWriter

    self.path_or_url = path_or_url
    self.shard_size = shard_size
    self.audio_format = audio_format
    self.load_audio = load_audio
    self.load_features = load_features
    self.load_custom = load_custom
    self.fault_tolerant = fault_tolerant

    if self.shard_size is None:
        # No sharding requested: everything goes through a single TarWriter.
        self.writer_init_fn = partial(TarWriter, self.path_or_url)
    else:
        assert self.shard_size > 0
        # Note: this ShardWriter is not from webdataset, but defined below in this file.
        self.writer_init_fn = partial(
            ShardWriter, self.path_or_url, maxcount=self.shard_size
        )

    # Runtime state, filled in later.
    self.writer = None
    self.num_shards_written = None
    self.finished = None
def get_duration(
    path: Pathlike,
) -> float:
    """
    Read an audio file and return its duration; both regular audio paths and
    Kaldi-style pipe commands (strings ending with ``|``) are supported.

    :param path: Path to an audio file or a Kaldi-style pipe.
    :return: float duration of the recording, in seconds.
    :raises ValueError: when ``path`` is a pipe and 'kaldi_native_io' is not available.
    """
    path = str(path)

    if path.strip().endswith("|"):
        if not is_module_available("kaldi_native_io"):
            raise ValueError(
                "To read Kaldi's data dir where wav.scp has 'pipe' inputs, "
                "please 'pip install kaldi_native_io' first."
            )
        import kaldi_native_io

        wave = kaldi_native_io.read_wave(path)
        assert wave.data.shape[0] == 1, f"Expect 1 channel. Given {wave.data.shape[0]}"
        return wave.duration

    try:
        # Try to parse the file using pysoundfile first.
        import soundfile

        info = soundfile.info(path)
    except Exception:
        # Try to parse the file using audioread as a fallback.
        # ``Exception`` (rather than a bare ``except``) still covers a missing
        # soundfile package and unreadable files, but no longer swallows
        # KeyboardInterrupt / SystemExit.
        info = audioread_info(path)
    return info.duration
def __init__(self, source: Union[Pathlike, Sequence[Pathlike]], **wds_kwargs) -> None:
    """
    Remember the data source and the extra keyword arguments for later use.

    :param source: a single path/URL or a sequence of them; stored as ``self.source``.
    :param wds_kwargs: additional keyword arguments; stored as ``self.wds_kwargs``.
    :raises ImportError: when the 'webdataset' package is not available.
    """
    webdataset_available = is_module_available("webdataset")
    if not webdataset_available:
        raise ImportError("Please 'pip install webdataset' first.")

    self.source, self.wds_kwargs = source, wds_kwargs
def get_duration(path: Pathlike, ) -> float:
    """
    Read an audio file and return its duration; both regular audio paths and
    Kaldi-style pipe commands (strings ending with ``|``) are supported.

    :param path: Path to an audio file or a Kaldi-style pipe.
    :return: float duration of the recording, in seconds.
    :raises ValueError: when ``path`` is a pipe and 'kaldiio' is not available.
    """
    path = str(path)

    if path.strip().endswith("|"):
        if not is_module_available("kaldiio"):
            raise ValueError(
                "To read Kaldi's data dir where wav.scp has 'pipe' inputs, "
                "please 'pip install kaldiio' first.")
        from kaldiio import load_mat

        # Note: kaldiio.load_mat returns
        # (sampling_rate: int, samples: 1-D np.array[int])
        sampling_rate, samples = load_mat(path)
        assert len(samples.shape) == 1
        duration = samples.shape[0] / sampling_rate
        return duration

    try:
        # Try to parse the file using pysoundfile first.
        import soundfile

        info = soundfile.info(path)
    except Exception:
        # Try to parse the file using audioread as a fallback.
        # ``Exception`` (rather than a bare ``except``) still covers a missing
        # soundfile package and unreadable files, but no longer swallows
        # KeyboardInterrupt / SystemExit.
        info = audioread_info(path)
    return info.duration
def __init__(self, config: Optional[Any] = None):
    """
    Initialize the extractor and build the underlying ``opensmile.Smile`` object.

    :param config: forwarded to the parent initializer; afterwards ``self.config``
        is read for all the opensmile options used below.
    :raises AssertionError: when the 'opensmile' package is not available.
    """
    super().__init__(config=config)
    assert is_module_available(
        "opensmile"
    ), 'To use opensmile extractors, please "pip install opensmile" first.'
    import opensmile

    # The feature set may be given by name (looked up in the FeatureSet enum)
    # or as an object that is used directly.
    requested_set = self.config.feature_set
    self.feature_set = (
        opensmile.FeatureSet[requested_set]
        if isinstance(requested_set, str)
        else requested_set
    )
    self.feature_level = opensmile.FeatureLevel(self.config.feature_level)

    smile_options = dict(
        feature_set=self.feature_set,
        feature_level=self.feature_level,
        sampling_rate=self.config.sampling_rate,
        options=self.config.options,
        loglevel=self.config.loglevel,
        logfile=self.config.logfile,
        channels=self.config.channels,
        mixdown=self.config.mixdown,
        resample=self.config.resample,
        num_workers=self.config.num_workers,
        verbose=self.config.verbose,
    )
    self.smileExtractor = opensmile.Smile(**smile_options)
def make_supervisions(xml_path: str, mer_thresh: int) -> list:
    """
    Parse an MGB2 XML transcript file into a list of supervision segments.

    :param xml_path: path to the XML file to parse.
    :param mer_thresh: segments whose ``WMER`` attribute exceeds this value are
        dropped; ``None`` disables the filtering.
    :return: a list of ``SupervisionSegment`` objects, one per retained ``segment`` element.
    :raises ValueError: when the 'bs4' (beautifulsoup4) package is not available.
    """
    if not is_module_available("bs4"):
        raise ValueError(
            "To prepare MGB2 data, please 'pip install beautifulsoup4' first."
        )
    from bs4 import BeautifulSoup

    # The document is fully parsed by the BeautifulSoup constructor, so the
    # handle can (and should) be closed right after instead of being leaked.
    with open(xml_path, "r") as xml_handle:
        soup = BeautifulSoup(xml_handle, "xml")

    return [
        SupervisionSegment(
            id=segment["id"] + "_" + segment["starttime"] + ":" + segment["endtime"],
            recording_id=segment["id"].split("_utt")[0].replace("_", "-"),
            start=float(segment["starttime"]),
            duration=round(
                float(segment["endtime"]) - float(segment["starttime"]), ndigits=8
            ),
            channel=0,
            # Join the text of all child elements, skipping those with no string content.
            text=" ".join(
                [
                    element.string
                    for element in segment.find_all("element")
                    if element.string is not None
                ]
            ),
            language="Arabic",
            # The numeric speaker index is extracted from the "who" attribute.
            speaker=int(match(r"\w+speaker(\d+)\w+", segment["who"]).group(1)),
        )
        for segment in soup.find_all("segment")
        if mer_thresh is None or float(segment["WMER"]) <= mer_thresh
    ]
def __init__(self, storage_path: Pathlike, *args, **kwargs):
    """
    Open a Kaldi scp file for reading through ``kaldiio.load_scp``.

    :param storage_path: path to the scp file; kept as ``self.storage_path``.
    :param args: accepted but not used here.
    :param kwargs: accepted but not used here.
    :raises ValueError: when the 'kaldiio' package is not available.
    """
    kaldiio_available = is_module_available('kaldiio')
    if not kaldiio_available:
        raise ValueError(
            "To read Kaldi feats.scp, please 'pip install kaldiio' first.")
    import kaldiio

    super().__init__()
    self.storage_path = storage_path
    scp_path = str(self.storage_path)
    self.storage = kaldiio.load_scp(scp_path)
def mini_webdataset(
    urls: Union[Pathlike, Sequence[Pathlike]],
    epoch: int = 0,
    shuffle_shards: bool = False,
    split_by_worker: bool = True,
    split_by_node: bool = False,
    ignore_error_shards: bool = True,
):
    """
    Return a pipeline for WebDataset-style data files.

    This is a convenience function for constructing a partial pipeline
    that reads from a set of sharded tar files, extracts the individual
    files, and groups them together into samples (dictionaries).

    You can use all the methods from `Composable` (`then`, `compose`) and
    from `Shorthands` (`batched`, `unbatched`, `decode`, `shuffle`, etc.)
    on the result.

    .. note:: This is a reduced version of ``webdataset.WebDataset`` function,
        that only uses the functionalities relevant to Lhotse, and makes it
        possible to disable the node/worker splitting.

    :param urls: the source URLs: a string or a list.
    :param epoch: epoch number, passed to the shard shuffler
        (used only when ``shuffle_shards`` is enabled).
    :param shuffle_shards: when True, a shard shuffling stage is added to the pipeline.
    :param split_by_worker: when True, webdataset's ``split_by_worker`` stage is
        added to the pipeline, so that shards are split per DataLoader worker.
    :param split_by_node: when True, webdataset's ``split_by_node`` stage is
        added to the pipeline, so that shards are split per node in DDP training.
    :param ignore_error_shards: when ``True``, shard loading errors are handled
        with webdataset's ``warn_and_continue``; when ``False``, they are
        re-raised (``reraise_exception``).
    :return: the assembled ``webdataset.DataPipeline``.
    :raises ImportError: when the 'webdataset' package is not available.
    """
    if not is_module_available("webdataset"):
        raise ImportError("Please 'pip install webdataset' first.")

    from webdataset import (
        DataPipeline,
        SimpleShardList,
        reraise_exception,
        tarfile_to_samples,
        warn_and_continue,
    )
    from webdataset import split_by_node as split_by_node_
    from webdataset import split_by_worker as split_by_worker_

    pipeline = DataPipeline(SimpleShardList(urls=urls))

    # Optional stages go in a fixed order: node split, worker split, shard shuffling.
    if split_by_node:
        pipeline.append(split_by_node_)
    if split_by_worker:
        pipeline.append(split_by_worker_)
    if shuffle_shards:
        pipeline.append(create_shard_shuffler(epoch=epoch))

    if ignore_error_shards:
        error_handler = warn_and_continue
    else:
        error_handler = reraise_exception
    pipeline.append(tarfile_to_samples(handler=error_handler))

    return pipeline
def prepare_single_commonvoice_tsv(
    lang: str,
    part: str,
    output_dir: Pathlike,
    lang_path: Pathlike,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Build the recording and supervision manifests for one CommonVoice TSV file.

    Rows that fail to parse or validate are logged and skipped; rows for which
    ``parse_utterance`` returns ``None`` are skipped silently.

    :param lang: string language code (e.g., "en").
    :param part: which split to prepare (e.g., "train", "validated", etc.).
    :param output_dir: path to directory where we will store the manifests.
    :param lang_path: path to a CommonVoice directory for a specific language
        (e.g., "/path/to/cv-corpus-7.0-2021-07-21/pl").
    :return: a tuple of (RecordingSet, SupervisionSet) objects opened in lazy mode,
        as CommonVoice manifests may be fairly large in memory.
    :raises ValueError: when the 'pandas' package is not available.
    """
    if not is_module_available("pandas"):
        raise ValueError(
            "To prepare CommonVoice data, please 'pip install pandas' first.")
    import pandas as pd

    lang_path = Path(lang_path)
    output_dir = Path(output_dir)

    # Read the metadata
    df = pd.read_csv(lang_path / f"{part}.tsv", sep="\t")

    recordings_path = output_dir / f"cv_recordings_{lang}_{part}.jsonl.gz"
    supervisions_path = output_dir / f"cv_supervisions_{lang}_{part}.jsonl.gz"

    # Scan all the audio files
    with RecordingSet.open_writer(
        recordings_path, overwrite=False
    ) as recs_writer, SupervisionSet.open_writer(
        supervisions_path, overwrite=False
    ) as sups_writer:
        rows = tqdm(df.iterrows(), desc="Processing audio files", total=len(df))
        for idx, row in rows:
            try:
                parsed = parse_utterance(row, lang_path, lang)
                if parsed is not None:
                    recording, segment = parsed
                    validate_recordings_and_supervisions(recording, segment)
                    recs_writer.write(recording)
                    sups_writer.write(segment)
            except Exception as e:
                # Best-effort processing: report the broken row and move on.
                logging.error(
                    f"Error when processing TSV file: line no. {idx}: '{row}'.\n"
                    f"Original error type: '{type(e)}' and message: {e}")

    # Re-open what was just written in lazy mode.
    recordings = RecordingSet.from_jsonl_lazy(recs_writer.path)
    supervisions = SupervisionSet.from_jsonl_lazy(sups_writer.path)
    return recordings, supervisions
def featuresets_names():
    """
    Return the names of the members of ``opensmile.FeatureSet`` as a list of strings.

    :raises AssertionError: when the 'opensmile' package is not available.
    """
    assert is_module_available(
        "opensmile"
    ), 'To use opensmile extractors, please "pip install opensmile" first.'
    import opensmile

    member_names = opensmile.FeatureSet.__members__
    return [name for name in member_names]
def __init__(self, storage_path: Pathlike, *args, **kwargs):
    """
    Open a Kaldi scp file for random access through ``kaldi_native_io``.

    :param storage_path: path to the scp file; kept as ``self.storage_path``.
    :param args: accepted but not used here.
    :param kwargs: accepted but not used here.
    :raises ValueError: when the 'kaldi_native_io' package is not available.
    """
    native_io_available = is_module_available("kaldi_native_io")
    if not native_io_available:
        raise ValueError(
            "To read Kaldi feats.scp, please 'pip install kaldi_native_io' first."
        )
    import kaldi_native_io

    super().__init__()
    self.storage_path = storage_path
    # Kaldi-style read specifier pointing at the scp file.
    rspecifier = f"scp:{self.storage_path}"
    self.storage = kaldi_native_io.RandomAccessFloatMatrixReader(rspecifier)
def open_best(path: Pathlike, mode: str = "r"):
    """
    Open ``path`` with the most capable opener that is available.

    When the 'smart_open' package is installed it is always used; otherwise
    ``.gz`` paths go through ``gzip_open_robust`` and everything else through
    the builtin ``open``.

    :param path: the path (or URL, when smart_open is available) to open.
    :param mode: the file mode; for ``.gz`` files opened without smart_open,
        "t" is appended when neither "t" nor "b" is present.
    :return: the opened file object.
    """
    if is_module_available("smart_open"):
        from smart_open import smart_open

        # This will work with JSONL anywhere that smart_open supports, e.g. cloud storage.
        return smart_open(path, mode)

    if not str(path).endswith(".gz"):
        return open(path, mode)

    if not ("t" in mode or "b" in mode):
        # Opening as bytes not requested explicitly, use "t" to tell gzip to handle unicode.
        mode += "t"
    return gzip_open_robust(path, mode)
def prepare_gigaspeech(
        gigaspeech: Any,
        dataset_parts: Union[str, Sequence[str]] = 'auto',
        output_dir: Optional[Pathlike] = None,
        num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare recording and supervision manifests for the requested GigaSpeech subsets.

    :param gigaspeech: an object exposing ``audios(part)`` and ``root_path``
        (presumably a ``speechcolab`` ``GigaSpeech`` instance -- confirm with the caller).
    :param dataset_parts: ``'auto'`` (use ``'{XL}'``, ``'{DEV}'`` and ``'{TEST}'``),
        a single subset name, or a sequence of subset names.
    :param output_dir: when given, manifests are read from / written to this directory.
    :param num_jobs: number of threads used to parse the utterances.
    :return: a dict mapping each subset name to a dict with the keys
        ``'recordings'`` and ``'supervisions'``.
    :raises ImportError: when the 'speechcolab' package is not available.
    """
    if is_module_available('speechcolab'):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab')

    if dataset_parts == 'auto':
        subsets = ('{XL}', '{DEV}', '{TEST}')
    elif isinstance(dataset_parts, str):
        # A single subset name: wrap it, otherwise the loops below would
        # iterate over its individual characters.
        subsets = [dataset_parts]
    else:
        subsets = dataset_parts

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        # The lookup uses the resolved subset names, i.e. the same names the
        # manifests are stored under at the end of this function.
        maybe_manifests = read_manifests_if_cached(dataset_parts=subsets, output_dir=output_dir, suffix='jsonl')
        if maybe_manifests is not None:
            return maybe_manifests

    manifests = defaultdict(dict)
    with ThreadPoolExecutor(num_jobs) as ex:
        for part in subsets:
            futures = [
                ex.submit(parse_utterance, audio, gigaspeech.root_path)
                for audio in tqdm(gigaspeech.audios(part), desc='Distributing tasks', leave=False)
            ]

            recordings = []
            supervisions = []
            for future in tqdm(futures, desc='Processing', leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segments = result
                recordings.append(recording)
                supervisions += segments

            manifests[part] = {
                'recordings': RecordingSet.from_recordings(recordings),
                'supervisions': SupervisionSet.from_segments(supervisions)
            }

            if output_dir is not None:
                manifests[part]['recordings'].to_file(output_dir / f'recordings_{part}.jsonl')
                manifests[part]['supervisions'].to_file(output_dir / f'supervisions_{part}.jsonl')

    return dict(manifests)
def __init__(
    self,
    storage_path: Pathlike,
    compression_method: Optional[int] = None,
    *args,
    **kwargs,
):
    """
    Create the output directory and open a ``kaldiio.WriteHelper`` that writes
    ``feats.ark`` and ``feats.scp`` inside it.

    :param storage_path: directory for the ark/scp pair; created when missing.
    :param compression_method: forwarded to ``kaldiio.WriteHelper``.
    :param args: accepted but not used here.
    :param kwargs: accepted but not used here.
    :raises ValueError: when the 'kaldiio' package is not available.
    """
    kaldiio_available = is_module_available("kaldiio")
    if not kaldiio_available:
        raise ValueError(
            "To read Kaldi feats.scp, please 'pip install kaldiio' first.")
    import kaldiio

    super().__init__()

    out_dir = Path(storage_path)
    out_dir.mkdir(parents=True, exist_ok=True)
    self.storage_dir = out_dir
    self.storage_path_ = str(out_dir / "feats.scp")

    # Kaldi-style write specifier producing both the archive and its index.
    wspecifier = f"ark,scp:{out_dir}/feats.ark,{out_dir}/feats.scp"
    self.storage = kaldiio.WriteHelper(
        wspecifier,
        compression_method=compression_method,
    )
def __init__(
    self,
    storage_path: Pathlike,
    compression_method: int = 1,
    *args,
    **kwargs,
):
    """
    Create the output directory and open a ``kaldi_native_io.CompressedMatrixWriter``
    that writes ``feats.ark`` and ``feats.scp`` inside it.

    :param storage_path: directory for the ark/scp pair; created when missing.
    :param compression_method: integer converted to ``kaldi_native_io.CompressionMethod``
        and kept as ``self.compression_method``.
    :param args: accepted but not used here.
    :param kwargs: accepted but not used here.
    :raises ValueError: when the 'kaldi_native_io' package is not available.
    """
    native_io_available = is_module_available("kaldi_native_io")
    if not native_io_available:
        raise ValueError(
            "To read Kaldi feats.scp, please 'pip install kaldi_native_io' first."
        )
    import kaldi_native_io

    super().__init__()

    out_dir = Path(storage_path)
    out_dir.mkdir(parents=True, exist_ok=True)
    self.storage_dir = out_dir
    self.storage_path_ = str(out_dir / "feats.scp")

    # Kaldi-style write specifier producing both the archive and its index.
    wspecifier = f"ark,scp:{out_dir}/feats.ark,{out_dir}/feats.scp"
    self.storage = kaldi_native_io.CompressedMatrixWriter(wspecifier)
    self.compression_method = kaldi_native_io.CompressionMethod(compression_method)
def dereverb_wpe_torch(
    audio: torch.Tensor,
    n_fft: int = 512,
    hop_length: int = 128,
    taps: int = 10,
    delay: int = 3,
    iterations: int = 3,
    statistics_mode: str = "full",
) -> torch.Tensor:
    """
    Dereverberate ``audio`` with the PyTorch WPE implementation from ``nara_wpe``
    (``wpe_v6``), applied in the STFT domain with a Blackman window.

    :param audio: a 2-D tensor (asserted); assumed to be (channels, samples) -- TODO confirm.
    :param n_fft: FFT size for the STFT / inverse STFT and the window length.
    :param hop_length: hop size for the STFT / inverse STFT.
    :param taps: forwarded to ``wpe_v6``.
    :param delay: forwarded to ``wpe_v6``.
    :param iterations: forwarded to ``wpe_v6``.
    :param statistics_mode: forwarded to ``wpe_v6``.
    :return: the tensor produced by the inverse STFT of the WPE output.
    :raises ImportError: when the 'nara_wpe' package is not available.
    """
    if not is_module_available("nara_wpe"):
        raise ImportError(
            "Please install nara_wpe first using 'pip install git+https://github.com/fgnt/nara_wpe' "
            "(at the time of writing, only GitHub version has a PyTorch implementation)."
        )
    from nara_wpe.torch_wpe import wpe_v6

    assert audio.ndim == 2

    window = torch.blackman_window(n_fft)
    stft_options = dict(n_fft=n_fft, hop_length=hop_length, window=window)

    spectrum = torch.stft(audio, return_complex=True, **stft_options)
    # wpe_v6 is fed the spectrum with its first two axes swapped;
    # the swap is undone before the inverse STFT.
    enhanced = wpe_v6(
        spectrum.permute(1, 0, 2),
        taps=taps,
        delay=delay,
        iterations=iterations,
        statistics_mode=statistics_mode,
    )
    return torch.istft(enhanced.permute(1, 0, 2), **stft_options)
def download_gigaspeech(
    password: str,
    target_dir: Pathlike = ".",
    dataset_parts: Optional[Union[str, Sequence[str]]] = "auto",
    host: Optional[str] = "tsinghua",
):
    """
    Download the requested GigaSpeech parts using the ``speechcolab`` toolkit.

    :param password: passed to ``GigaSpeech.download`` for every part.
    :param target_dir: directory handed to the ``GigaSpeech`` constructor.
    :param dataset_parts: ``"auto"`` (download "XL", "DEV" and "TEST"), a single
        part name, or a sequence of part names (given without curly braces).
    :param host: passed to ``GigaSpeech.download`` as the ``host`` keyword.
    :raises ImportError: when the 'speechcolab' package is not available.
    """
    if not is_module_available("speechcolab"):
        raise ImportError(
            "To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab"
        )
    from speechcolab.datasets.gigaspeech import GigaSpeech

    gigaspeech = GigaSpeech(target_dir)

    if dataset_parts == "auto":
        parts = ("XL", "DEV", "TEST")
    elif isinstance(dataset_parts, str):
        parts = [dataset_parts]
    else:
        parts = dataset_parts

    for part in parts:
        logging.info(f"Downloading GigaSpeech part: {part}")
        # The toolkit is given the part name wrapped in curly braces.
        subset_tag = "{" + part + "}"
        gigaspeech.download(password, subset_tag, host=host)
def _parse_vtt(lines, noise):
    """
    Generator that yields ``(start, end, text)`` tuples parsed from VTT content.

    The input is split on blank lines into blocks; for each processed block the
    first line is parsed with ``_parse_time_segment`` and the remaining lines
    are joined and normalized.

    :param lines: the whole VTT content as a single string.
    :param noise: the string substituted for parenthesized spans such as "(...)".
    :return: yields tuples of ``(start, end, normalized_text)``.
    :raises ImportError: when the 'regex' package is not available; because this
        is a generator function, the error is raised on first iteration.
    """
    # Import regex for some special unicode handling that re has issues with
    if not is_module_available("regex"):
        raise ImportError(
            "regex package not found. Please install..." " (pip install regex)"
        )
    else:
        import regex as re2

    # Parenthesized spans are replaced with the ``noise`` marker.
    noise_pattern = re2.compile(r"\([^)]*\)", re2.UNICODE)
    # An apostrophe between two word characters.
    apostrophe_pattern = re2.compile(r"(\w)'(\w)")
    # HTML entities and <i>/<u> tags (opening or closing) are removed.
    html_tags = re2.compile(r"(&[^ ;]*;)|(</?[iu]>)")

    blocks = lines.split("\n\n")
    # Counting starts at -1 and only indices > 0 are processed, i.e. the first
    # two blocks are always skipped (presumably the file header -- TODO confirm).
    for i, b in enumerate(blocks, -1):
        if i > 0 and b.strip() != "":
            b_lines = b.split("\n")
            start, end = _parse_time_segment(b_lines[0])
            line = " ".join(b_lines[1:])
            # NOTE(review): the source was flattened; ``line_new`` being pre-set
            # suggests the ``yield`` below runs even when the text is only
            # dashes/spaces -- confirm against the original file.
            line_new = line
            if line.strip("- ") != "":
                line_parts = noise_pattern.sub(noise, line_new)
                line_parts = apostrophe_pattern.sub(r"\1\u2019\2", line_parts)
                line_parts = html_tags.sub("", line_parts)
                line_parts_new = []
                # Clean the text between noise markers piece by piece, then
                # re-insert the markers surrounded by single spaces.
                for lp in line_parts.split(noise):
                    line_parts_new.append(
                        "".join(
                            [i for i in filter(_filter, lp.strip().replace("-", " "))]
                        )
                    )
                joiner = " " + noise + " "
                line_new = joiner.join(line_parts_new)
                # Each Unicode space separator goes through ``_normalize_space``.
                line_new = re2.sub(
                    r"\p{Zs}", lambda m: _normalize_space(m.group(0)), line_new
                )
                # Collapse runs of spaces, trim, and lowercase.
                line_new = re2.sub(r" +", " ", line_new).strip().lower()
            yield start, end, line_new
def prepare_aishell4(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build the AISHELL-4 manifests (recordings and supervisions) for the parts
    "train_L", "train_M", "train_S" and "test".

    Recordings whose TextGrid annotation fails to load with a ``ValueError``
    are skipped with a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    :raises ValueError: when the 'textgrid' package is not available.
    """
    if not is_module_available("textgrid"):
        raise ValueError(
            "To prepare AISHELL-4 data, please 'pip install textgrid' first.")
    import textgrid

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Maps (recording id, speaker name within the recording) to a corpus-wide
    # speaker id; shared across all the parts.
    global_spk_id = {}

    for part in ["train_L", "train_M", "train_S", "test"]:
        recordings = []
        supervisions = []

        for audio_path in (corpus_dir / part / "wav").rglob("*.flac"):
            idx = audio_path.stem

            try:
                tg = textgrid.TextGrid.fromFile(
                    f"{corpus_dir}/{part}/TextGrid/{idx}.TextGrid")
            except ValueError:
                logging.warning(
                    f"{idx} has annotation issues. Skipping this recording.")
                continue

            recordings.append(Recording.from_file(audio_path))

            for tier in tg.tiers:
                # Each tier is one speaker; assign a new id on first sight.
                key = (idx, tier.name)
                spk_id = global_spk_id.setdefault(
                    key, f"SPK{len(global_spk_id)+1:04d}")

                for j, interval in enumerate(tier.intervals):
                    if interval.mark == "":
                        # Unlabelled interval: nothing to transcribe.
                        continue
                    begin, finish = interval.minTime, interval.maxTime
                    supervisions.append(
                        SupervisionSegment(
                            id=f"{idx}-{spk_id}-{j}",
                            recording_id=idx,
                            start=begin,
                            duration=round(finish - begin, 4),
                            channel=0,
                            language="Chinese",
                            speaker=spk_id,
                            text=interval.mark.strip(),
                        )
                    )

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"supervisions_{part}.jsonl")
            recording_set.to_file(output_dir / f"recordings_{part}.jsonl")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set
        }

    return manifests
def check_dependencies():
    """
    Verify that the 'pandas' package needed by the Gale Arabic recipe is importable.

    :raises ImportError: when 'pandas' is not available.
    """
    if is_module_available("pandas"):
        return
    raise ImportError(
        "Gale Arabic data preparation requires the 'pandas' package to be installed. "
        "Please install it with 'pip install pandas' and try again")
def prepare_commonvoice(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    languages: Union[str, Sequence[str]] = "auto",
    splits: Union[str, Sequence[str]] = COMMONVOICE_DEFAULT_SPLITS,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    This function expects the input directory structure of::

        >>> metadata_path = corpus_dir / language_code / "{train,dev,test}.tsv"
        >>> # e.g. pl_train_metadata_path = "/path/to/cv-corpus-7.0-2021-07-21/pl/train.tsv"
        >>> audio_path = corpus_dir / language_code / "clips"
        >>> # e.g. pl_audio_path = "/path/to/cv-corpus-7.0-2021-07-21/pl/clips"

    Returns a dict with 3-level structure (lang -> split -> manifest-type)::

        >>> {'en/fr/pl/...': {'train/dev/test': {'recordings/supervisions': manifest}}}

    :param corpus_dir: Pathlike, the path to the downloaded corpus.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param languages: 'auto' (prepare all discovered data), a single language code,
        or a list of language codes.
    :param splits: by default ``['train', 'dev', 'test']``, can also include
        ``'validated'``, ``'invalidated'``, and ``'other'``. A single split name
        may be given as a string.
    :param num_jobs: How many concurrent workers to use for scanning of the audio files.
        Values greater than 1 are not supported and only trigger a warning.
    :return: a dict with manifests for all specified languages and their train/dev/test splits.
    :raises ValueError: when the 'pandas' package is not available, or when
        ``languages='auto'`` and no known language directory is found in ``corpus_dir``.
    """
    if not is_module_available("pandas"):
        raise ValueError(
            "To prepare CommonVoice data, please 'pip install pandas' first.")
    if num_jobs > 1:
        warnings.warn(
            "num_jobs>1 currently not supported for CommonVoice data prep; "
            "setting to 1.")

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    assert output_dir is not None, (
        "CommonVoice recipe requires to specify the output "
        "manifest directory (output_dir cannot be None).")
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if languages == "auto":
        # Keep only the known language codes that have a directory in the corpus.
        languages = set(COMMONVOICE_LANGS).intersection(
            path.name for path in corpus_dir.glob("*"))
        if not languages:
            raise ValueError(
                f"Could not find any of CommonVoice languages in: {corpus_dir}"
            )
    elif isinstance(languages, str):
        languages = [languages]

    if isinstance(splits, str):
        # A single split name: wrap it, otherwise the loop below would iterate
        # over its individual characters.
        splits = [splits]

    manifests = {}

    for lang in tqdm(languages, desc="Processing CommonVoice languages"):
        logging.info(f"Language: {lang}")
        lang_path = corpus_dir / lang

        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        # Pattern: "cv_recordings_en_train.jsonl.gz" / "cv_supervisions_en_train.jsonl.gz"
        lang_manifests = read_cv_manifests_if_cached(output_dir=output_dir,
                                                     language=lang)

        for part in splits:
            logging.info(f"Split: {part}")
            if part in lang_manifests:
                logging.info(
                    f"CommonVoice language: {lang} already prepared - skipping."
                )
                continue
            recording_set, supervision_set = prepare_single_commonvoice_tsv(
                lang=lang,
                part=part,
                output_dir=output_dir,
                lang_path=lang_path,
            )
            lang_manifests[part] = {
                "supervisions": supervision_set,
                "recordings": recording_set,
            }

        manifests[lang] = lang_manifests

    return manifests
arr = cuts[1].load_features() assert arr.shape[0] == 100 assert arr.shape[1] == extractor.feature_dim(cuts[0].sampling_rate) @pytest.mark.parametrize( "extractor_type", [ Fbank, Mfcc, TorchaudioFbank, TorchaudioMfcc, pytest.param( KaldifeatFbank, marks=pytest.mark.skipif( not is_module_available("kaldifeat"), reason="Requires kaldifeat to run.", ), ), pytest.param( KaldifeatMfcc, marks=pytest.mark.skipif( not is_module_available("kaldifeat"), reason="Requires kaldifeat to run.", ), ), pytest.param( lambda: LibrosaFbank(LibrosaFbankConfig(sampling_rate=16000)), marks=[ pytest.mark.skipif( not is_module_available("librosa"),
assert (stats["norm_stds"] == read_stats["norm_stds"]).all() @pytest.mark.parametrize( "storage_fn", [ lambda: LilcomFilesWriter(TemporaryDirectory().name), lambda: LilcomHdf5Writer(NamedTemporaryFile().name), lambda: ChunkedLilcomHdf5Writer(NamedTemporaryFile().name), lambda: LilcomChunkyWriter(NamedTemporaryFile().name), lambda: NumpyFilesWriter(TemporaryDirectory().name), lambda: NumpyHdf5Writer(NamedTemporaryFile().name), pytest.param( lambda: KaldiWriter(TemporaryDirectory().name), marks=pytest.mark.skipif( not is_module_available("kaldiio"), reason="kaldiio must be installed for scp+ark feature writing", ), ), ], ) def test_feature_set_builder(storage_fn): recordings: RecordingSet = RecordingSet.from_json( "test/fixtures/audio.json") extractor = Fbank(FbankConfig(sampling_rate=8000)) with storage_fn() as storage: builder = FeatureSetBuilder( feature_extractor=extractor, storage=storage, ) feature_set = builder.process_and_store_recordings(
('yaml', True), ('json', False), ('json', True), ('jsonl', False), ('jsonl', True), ]) def test_generic_serialization(manifests, manifest_type, format, compressed): manifest = manifests[manifest_type] with NamedTemporaryFile(suffix='.' + format + ('.gz' if compressed else '')) as f: store_manifest(manifest, f.name) restored = load_manifest(f.name) assert manifest == restored @pytest.mark.skipif(not is_module_available('pyarrow'), reason='Requires pyarrow') @pytest.mark.parametrize('manifest_type', ['recording_set', 'supervision_set', 'cut_set']) @pytest.mark.parametrize(['format', 'compressed'], [ ('jsonl', False), ('jsonl', True), ]) def test_lazy_jsonl_deserialization(manifests, manifest_type, format, compressed): manifest = manifests[manifest_type] with NamedTemporaryFile(suffix='.' + format + ('.gz' if compressed else '')) as f: store_manifest(manifest, f.name) lazy_manifest = type(manifest).from_jsonl_lazy(f.name) # Test iteration
def download_librispeech(
    target_dir: Pathlike = ".",
    dataset_parts: Optional[Union[str, Sequence[str]]] = "mini_librispeech",
    force_download: bool = False,
    alignments: bool = False,
    base_url: str = "http://www.openslr.org/resources",
    alignments_url: str = LIBRISPEECH_ALIGNMENTS_URL,
) -> Path:
    """
    Download and untar the dataset; both LibriSpeech and MiniLibrispeech are supported.

    A part is skipped when its directory already holds a ``.completed`` marker file;
    an archive is downloaded only when it is not on disk yet or ``force_download`` is set.

    :param target_dir: Pathlike, the directory in which the dataset is stored.
    :param dataset_parts: "librispeech", "mini_librispeech",
        or a list of splits (e.g. "dev-clean") to download.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param alignments: whether the alignments should be downloaded as well.
        The original source is: https://github.com/CorentinJ/librispeech-alignments
    :param base_url: str, the url of the OpenSLR resources.
    :param alignments_url: str, the url of LibriSpeech word alignments
    :return: the path to downloaded and extracted directory with data.
    :raises AssertionError: when alignments have to be downloaded and the
        'gdown' package is not available.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    corpus_dir = target_dir / "LibriSpeech"

    if isinstance(dataset_parts, str):
        # Resolve the two named collections; any other string is a single split.
        named_collections = {
            "librispeech": LIBRISPEECH,
            "mini_librispeech": MINI_LIBRISPEECH,
        }
        dataset_parts = named_collections.get(dataset_parts, [dataset_parts])

    for part in tqdm(dataset_parts, desc="Downloading LibriSpeech parts"):
        logging.info(f"Processing split: {part}")

        # Determine the valid URL for a given split.
        if part in LIBRISPEECH:
            resource_id = "12"
        elif part in MINI_LIBRISPEECH:
            resource_id = "31"
        else:
            logging.warning(f"Invalid dataset part name: {part}")
            continue
        url = f"{base_url}/{resource_id}"

        # Split directory exists and seem valid? Skip this split.
        part_dir = corpus_dir / part
        completed_detector = part_dir / ".completed"
        if completed_detector.is_file():
            logging.info(
                f"Skipping {part} because {completed_detector} exists.")
            continue

        # Maybe-download the archive.
        tar_name = f"{part}.tar.gz"
        tar_path = target_dir / tar_name
        already_downloaded = tar_path.is_file()
        if force_download or not already_downloaded:
            urlretrieve_progress(f"{url}/{tar_name}",
                                 filename=tar_path,
                                 desc=f"Downloading {tar_name}")

        # Remove partial unpacked files, if any, and unpack everything.
        shutil.rmtree(part_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()

    if not alignments:
        return corpus_dir

    ali_completed = target_dir / ".ali_completed"
    if ali_completed.is_file() and not force_download:
        return corpus_dir

    assert is_module_available(
        "gdown"
    ), 'To download LibriSpeech alignments, please install "pip install gdown"'
    import gdown

    ali_zip_path = str(target_dir / "LibriSpeech-Alignments.zip")
    gdown.download(alignments_url, output=ali_zip_path)
    with zipfile.ZipFile(ali_zip_path) as archive:
        archive.extractall(path=target_dir)
    ali_completed.touch()

    return corpus_dir
:param data: the contents of ``manifest.custom`` field. :return: ``custom`` field dict with deserialized manifests (if any), or None when input is None. """ if data is None: return None from lhotse.array import deserialize_array # If any of the values in the input are also dicts, # it indicates that might be a serialized array manifest. # We'll try to deserialize it, and if there is an error, # we'll just leave it as it was. for key, value in data.items(): if isinstance(value, dict): try: data[key] = deserialize_array(value) except: pass return data if is_module_available("orjson"): import orjson decode_json_line = orjson.loads else: decode_json_line = json.loads
def load_kaldi_data_dir(
        path: Pathlike,
        sampling_rate: int,
        frame_shift: Optional[Seconds] = None,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to Lhotse manifests.

    The ``wav.scp`` and ``reco2dur`` files must exist.
    A SupervisionSet is created only when a ``segments`` file exists; the
    ``text``, ``utt2spk``, ``spk2gender`` and ``utt2lang`` files are then read too.
    A FeatureSet is created only when ``feats.scp`` exists, the 'kaldiio'
    package is available and ``frame_shift`` is given; with ``feats.scp`` and
    'kaldiio' present but no ``frame_shift``, a warning is issued instead.

    :param path: the Kaldi data directory (must be an existing directory).
    :param sampling_rate: sampling rate assigned to every recording and feature manifest.
    :param frame_shift: frame shift used to describe the imported features.
    :return: a tuple of (RecordingSet, SupervisionSet or None, FeatureSet or None).
    :raises ValueError: when the ``reco2dur`` file does not exist.
    """
    path = Path(path)
    assert path.is_dir()

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)

    reco2dur = path / 'reco2dur'
    if not reco2dur.is_file():
        raise ValueError(
            f"No such file: '{reco2dur}' -- fix it by running: utils/data/get_reco2dur.sh <data-dir>"
        )
    # Recordings that are missing from reco2dur resolve to a duration of 0.0.
    durations = defaultdict(float)
    with reco2dur.open() as f:
        for line in f:
            rec_id, dur = line.strip().split()
            durations[rec_id] = float(dur)

    def to_recording(rec_id: str, path_or_cmd: str) -> Recording:
        # Entries ending with '|' are commands; the trailing pipe is dropped.
        is_pipe = path_or_cmd.endswith('|')
        rec_duration = durations[rec_id]
        source = AudioSource(
            type='command' if is_pipe else 'file',
            channels=[0],
            source=path_or_cmd[:-1] if is_pipe else path_or_cmd,
        )
        return Recording(
            id=rec_id,
            sources=[source],
            sampling_rate=sampling_rate,
            num_samples=int(rec_duration * sampling_rate),
            duration=rec_duration,
        )

    recording_set = RecordingSet.from_recordings(
        to_recording(rec_id, path_or_cmd)
        for rec_id, path_or_cmd in recordings.items()
    )

    supervision_set = None
    segments = path / 'segments'
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [row.strip().split() for row in f]

        texts = load_kaldi_text_mapping(path / 'text')
        speakers = load_kaldi_text_mapping(path / 'utt2spk')
        genders = load_kaldi_text_mapping(path / 'spk2gender')
        languages = load_kaldi_text_mapping(path / 'utt2lang')

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=segment_id,
                recording_id=recording_id,
                start=float(start),
                duration=float(end) - float(start),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=speakers[segment_id],
                gender=genders[speakers[segment_id]],
            )
            for segment_id, recording_id, start, end in supervision_segments
        )

    feature_set = None
    feats_scp = path / 'feats.scp'
    if feats_scp.exists() and is_module_available('kaldiio'):
        if frame_shift is None:
            warnings.warn("Failed to import Kaldi 'feats.scp' to Lhotse: "
                          "frame_shift must be not None. "
                          "Feature import omitted.")
        else:
            import kaldiio
            from lhotse.features.io import KaldiReader

            feature_set = FeatureSet.from_features(
                Features(
                    type='kaldiio',
                    num_frames=mat.shape[0],
                    num_features=mat.shape[1],
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat.shape[0] * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    # Without supervisions the utterance id doubles as the recording id.
                    recording_id=supervision_set[utt_id].recording_id
                    if supervision_set is not None else utt_id,
                    channels=0,
                )
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp))
            )

    return recording_set, supervision_set, feature_set
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
    map_string_to_underscores: Optional[str] = None,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to Lhotse manifests.

    The ``wav.scp`` file must exist; recording durations are obtained by calling
    ``get_duration`` on every ``wav.scp`` entry in a process pool.
    A SupervisionSet is created only when a ``segments`` file exists; the
    ``text``, ``utt2spk``, ``spk2gender`` and ``utt2lang`` files are then read too.
    A FeatureSet is created only when ``feats.scp`` exists, the 'kaldi_native_io'
    package is available and ``frame_shift`` is given; with ``feats.scp`` and
    'kaldi_native_io' present but no ``frame_shift``, a warning is issued instead.

    :param path: the Kaldi data directory (must be an existing directory).
    :param sampling_rate: sampling rate assigned to every recording and feature manifest.
    :param frame_shift: frame shift used to describe the imported features.
    :param map_string_to_underscores: optional string, when specified, we will replace
        all instances of this string in SupervisonSegment IDs to underscores.
        This is to help with handling underscores in Kaldi
        (see :func:`.export_to_kaldi`). This is also done for speaker IDs.
    :param num_jobs: number of worker processes used to compute the durations.
    :return: a tuple of (RecordingSet, SupervisionSet or None, FeatureSet or None).
    """
    path = Path(path)
    assert path.is_dir()

    def fix_id(t: str) -> str:
        # Identity unless a replacement string was requested.
        return t if map_string_to_underscores is None else t.replace(
            map_string_to_underscores, "_"
        )

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / "wav.scp", must_exist=True)

    with ProcessPoolExecutor(num_jobs) as ex:
        dur_vals = ex.map(get_duration, recordings.values())
    durations = dict(zip(recordings.keys(), dur_vals))

    def to_recording(rec_id: str, path_or_cmd: str) -> Recording:
        # Entries ending with "|" are commands; the trailing pipe is dropped.
        is_pipe = path_or_cmd.endswith("|")
        rec_duration = durations[rec_id]
        source = AudioSource(
            type="command" if is_pipe else "file",
            channels=[0],
            source=path_or_cmd[:-1] if is_pipe else path_or_cmd,
        )
        return Recording(
            id=rec_id,
            sources=[source],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(rec_duration, sampling_rate),
            duration=rec_duration,
        )

    recording_set = RecordingSet.from_recordings(
        to_recording(rec_id, path_or_cmd)
        for rec_id, path_or_cmd in recordings.items()
    )

    supervision_set = None
    segments = path / "segments"
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [row.strip().split() for row in f]

        texts = load_kaldi_text_mapping(path / "text")
        speakers = load_kaldi_text_mapping(path / "utt2spk")
        genders = load_kaldi_text_mapping(path / "spk2gender")
        languages = load_kaldi_text_mapping(path / "utt2lang")

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=fix_id(segment_id),
                recording_id=recording_id,
                start=float(start),
                duration=add_durations(
                    float(end), -float(start), sampling_rate=sampling_rate
                ),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=fix_id(speakers[segment_id]),
                # Gender is looked up with the original (unfixed) speaker id.
                gender=genders[speakers[segment_id]],
            )
            for segment_id, recording_id, start, end in supervision_segments
        )

    feature_set = None
    feats_scp = path / "feats.scp"
    if feats_scp.exists() and is_module_available("kaldi_native_io"):
        if frame_shift is None:
            warnings.warn(
                "Failed to import Kaldi 'feats.scp' to Lhotse: "
                "frame_shift must be not None. "
                "Feature import omitted."
            )
        else:
            import kaldi_native_io

            from lhotse.features.io import KaldiReader

            shape_reader = kaldi_native_io.SequentialMatrixShapeReader(
                f"scp:{feats_scp}"
            )
            feature_set = FeatureSet.from_features(
                Features(
                    type="kaldi_native_io",
                    num_frames=mat_shape.num_rows,
                    num_features=mat_shape.num_cols,
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat_shape.num_rows * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    # Without supervisions the utterance id doubles as the recording id.
                    recording_id=supervision_set[utt_id].recording_id
                    if supervision_set is not None
                    else utt_id,
                    channels=0,
                )
                for utt_id, mat_shape in shape_reader
            )

    return recording_set, supervision_set, feature_set