def prepare_yesno(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When ``output_dir`` is provided, the manifests will also be written to disk.

    :param corpus_dir: Pathlike, the path of the data dir. It's expected to
        contain wave files with the pattern x_x_x_x_x_x_x_x.wav, where there
        are 8 x's and each x is either 1 or 0.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is either "train" or "test", and whose value is
        a dict with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    wave_files = list(corpus_dir.glob("*.wav"))
    assert len(wave_files) == 60

    wave_files.sort()
    train_set = wave_files[::2]
    test_set = wave_files[1::2]

    assert len(train_set) == 30
    assert len(test_set) == 30

    manifests = defaultdict(dict)
    for name, dataset in zip(["train", "test"], [train_set, test_set]):
        recordings, supervisions = _prepare_dataset(dataset)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{name}.json")
            recording_set.to_json(output_dir / f"recordings_{name}.json")

        manifests[name] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
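# A minimal usage sketch (hypothetical paths; assumes the yes_no corpus was
# downloaded beforehand). ``CutSet.from_manifests`` is the usual Lhotse way to
# combine the two manifests for further processing.
def _example_prepare_yesno():  # hypothetical helper, for illustration only
    from lhotse import CutSet

    manifests = prepare_yesno("corpora/waves_yesno", output_dir="data/manifests")
    cuts = CutSet.from_manifests(
        recordings=manifests["train"]["recordings"],
        supervisions=manifests["train"]["supervisions"],
    )
    print(cuts)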
def download_voxceleb1(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
) -> Path:
    """
    Download and unzip the VoxCeleb1 data.

    .. note:: A "connection refused" error may occur if you are downloading without a password.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: bool, if True, download the archive even if it already exists.
    :return: the path to the downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    zip_name = "vox1_dev_wav.zip"
    zip_path = target_dir / zip_name
    if zip_path.exists() and not force_download:
        logging.info(f"Skipping {zip_name} because file exists.")
    else:
        # Download the data in parts into the target directory.
        for url in VOXCELEB1_PARTS_URL:
            urlretrieve_progress(
                url,
                filename=target_dir / url.split("/")[-1],
                desc=f"Downloading VoxCeleb1 {url.split('/')[-1]}",
            )
        # Combine the parts for the dev set. The parts have to be concatenated
        # in lexicographic order, and the combined archive must be written to
        # ``zip_path`` (not a path relative to the current working directory).
        with open(zip_path, "wb") as outFile:
            for file in sorted(target_dir.glob("vox1_dev_wav_part*")):
                with open(file, "rb") as inFile:
                    shutil.copyfileobj(inFile, outFile)
    logging.info("Unzipping dev...")
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(target_dir)
    logging.info("Unzipping test...")
    with zipfile.ZipFile(target_dir / "vox1_test_wav.zip") as zf:
        zf.extractall(target_dir)
    return target_dir
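# A minimal usage sketch (hypothetical path). Note that the function also
# unzips vox1_test_wav.zip, so the test archive is assumed to be present in
# the target directory (e.g. covered by the URLs in VOXCELEB1_PARTS_URL).
def _example_download_voxceleb1():  # hypothetical helper, for illustration only
    corpus_root = download_voxceleb1(target_dir="corpora/voxceleb1")
    print(f"VoxCeleb1 extracted under: {corpus_root}")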
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset
        part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: int, the number of parallel workers parsing the data.
    :return: a Dict whose key is the dataset part, and whose value is a dict
        with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "mini_librispeech":
        dataset_parts = set(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob("*")
        )
    elif dataset_parts == "auto":
        dataset_parts = set(LIBRISPEECH).union(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob("*")
        )
        if not dataset_parts:
            raise ValueError(
                f"Could not find any of librispeech or mini_librispeech splits in: {corpus_dir}"
            )
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts, output_dir=output_dir
        )

    with ThreadPoolExecutor(num_jobs) as ex:
        for part in tqdm(dataset_parts, desc="Dataset parts"):
            logging.info(f"Processing LibriSpeech subset: {part}")
            if manifests_exist(part=part, output_dir=output_dir):
                logging.info(f"LibriSpeech subset: {part} already prepared - skipping.")
                continue
            recordings = []
            supervisions = []
            part_path = corpus_dir / part
            futures = []
            for trans_path in tqdm(
                part_path.rglob("*.trans.txt"), desc="Distributing tasks", leave=False
            ):
                alignments = {}
                ali_path = trans_path.parent / (
                    trans_path.stem.split(".")[0] + ".alignment.txt"
                )
                if ali_path.exists():
                    alignments = parse_alignments(ali_path)
                # "trans_path" file contains lines like:
                #
                #   121-121726-0000 ALSO A POPULAR CONTRIVANCE
                #   121-121726-0001 HARANGUE THE TIRESOME PRODUCT OF A TIRELESS TONGUE
                #   121-121726-0002 ANGOR PAIN PAINFUL TO HEAR
                #
                # We will create a separate Recording and SupervisionSegment for those.
                with open(trans_path) as f:
                    for line in f:
                        futures.append(
                            ex.submit(parse_utterance, part_path, line, alignments)
                        )

            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segment = result
                recordings.append(recording)
                supervisions.append(segment)

            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)

            validate_recordings_and_supervisions(recording_set, supervision_set)

            if output_dir is not None:
                supervision_set.to_file(output_dir / f"supervisions_{part}.json")
                recording_set.to_file(output_dir / f"recordings_{part}.json")

            manifests[part] = {
                "recordings": recording_set,
                "supervisions": supervision_set,
            }

    return manifests
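# A minimal usage sketch (hypothetical paths). Passing an explicit list of
# parts skips the automatic discovery that the default "auto" performs.
def _example_prepare_librispeech():  # hypothetical helper, for illustration only
    manifests = prepare_librispeech(
        corpus_dir="corpora/LibriSpeech",
        dataset_parts=["dev-clean", "test-clean"],
        output_dir="data/manifests",
        num_jobs=4,
    )
    print(manifests["dev-clean"]["supervisions"])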
def prepare_mls(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    opus: bool = True,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]]:
    """
    Prepare Multilingual LibriSpeech corpus.

    Returns a dict structured like the following:

    .. code-block:: python

        {
            'english': {
                'train': {'recordings': RecordingSet(...), 'supervisions': SupervisionSet(...)},
                'dev': ...,
                'test': ...
            },
            'polish': { ... },
            ...
        }

    :param corpus_dir: Path to the corpus root (directories with specific languages should be inside).
    :param output_dir: Optional path where the manifests should be stored.
    :param opus: Should we scan for OPUS files (otherwise we'll look for FLAC files).
    :param num_jobs: How many jobs should be used for creating recording manifests.
    :return: A dict with structure: ``d[language][split] = {recordings, supervisions}``.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir) if output_dir is not None else None
    assert corpus_dir.is_dir()

    languages = {
        d.name.split("_")[1]: d
        for d in corpus_dir.glob("mls_*")
        if d.is_dir()
        and "_lm_" not in d.name
        and (opus or not d.name.endswith("opus"))
    }
    logging.info(f"Found MLS languages: {list(languages)}")

    manifests = defaultdict(dict)
    for lang, lang_dir in tqdm(
        languages.items(), desc="Languages", total=len(languages)
    ):
        logging.info(f"Processing language: {lang}")

        # Read the speaker to gender mapping.
        spk2gender = {}
        for line in (lang_dir / "metainfo.txt").read_text().splitlines():
            spk, gender, *_ = line.split("|")
            spk2gender[spk.strip()] = gender.strip()

        for split in tqdm(["test", "dev", "train"], desc="Splits"):
            # If everything is ready, read it and skip it.
            recordings_path = (
                None
                if output_dir is None
                else output_dir / f"recordings_{lang}_{split}.jsonl.gz"
            )
            supervisions_path = (
                None
                if output_dir is None
                else output_dir / f"supervisions_{lang}_{split}.jsonl.gz"
            )
            if (
                recordings_path is not None
                and recordings_path.is_file()
                and supervisions_path is not None
                and supervisions_path.is_file()
            ):
                logging.info(f"Skipping - {lang}/{split} - already exists!")
                recordings = RecordingSet.from_file(recordings_path)
                supervisions = SupervisionSet.from_file(supervisions_path)
                manifests[lang][split] = {
                    "recordings": recordings,
                    "supervisions": supervisions,
                }
                continue

            # Create recordings manifest.
            split_dir = lang_dir / split
            recordings = RecordingSet.from_dir(
                path=split_dir,
                pattern="*.opus" if opus else "*.flac",
                num_jobs=num_jobs,
                force_opus_sampling_rate=16000,
            )

            # Create supervisions manifest.
            supervisions = []
            for line in (split_dir / "transcripts.txt").read_text().splitlines():
                recording_id, text = line.split("\t")
                speaker = recording_id.split("_")[0]
                supervisions.append(
                    SupervisionSegment(
                        id=recording_id,
                        recording_id=recording_id,
                        text=text,
                        speaker=speaker,
                        gender=spk2gender[speaker],
                        start=0.0,
                        duration=recordings.duration(recording_id),
                        language=lang,
                    )
                )
            supervisions = SupervisionSet.from_segments(supervisions)

            # Fix any missing recordings/supervisions.
            recordings, supervisions = fix_manifests(recordings, supervisions)
            validate_recordings_and_supervisions(recordings, supervisions)

            # Save for return.
            manifests[lang][split] = {
                "recordings": recordings,
                "supervisions": supervisions,
            }

            # Optional storage on disk.
            if output_dir is not None:
                output_dir.mkdir(exist_ok=True, parents=True)
                recordings.to_jsonl(recordings_path)
                supervisions.to_jsonl(supervisions_path)

    return dict(manifests)
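# A minimal usage sketch (hypothetical paths; assumes language directories such
# as "mls_polish" sit under the corpus root, as in the official MLS download).
# The query pattern for the result is manifests[language][split][manifest_type].
def _example_prepare_mls():  # hypothetical helper, for illustration only
    mls = prepare_mls(
        corpus_dir="corpora/mls",
        output_dir="data/manifests",
        opus=True,
        num_jobs=4,
    )
    print(mls["polish"]["train"]["recordings"])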
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = 'auto',
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset
        part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: int, the number of parallel workers parsing the data.
    :return: a Dict whose key is the dataset part, and whose value is a dict
        with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if dataset_parts == 'auto':
        dataset_parts = (set(LIBRISPEECH).union(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob('*')))
        if not dataset_parts:
            raise ValueError(
                f"Could not find any of librispeech or mini_librispeech splits in: {corpus_dir}"
            )
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        maybe_manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                                   output_dir=output_dir)
        if maybe_manifests is not None:
            return maybe_manifests

    manifests = defaultdict(dict)
    with ThreadPoolExecutor(num_jobs) as ex:
        for part in tqdm(dataset_parts, desc='Dataset parts'):
            recordings = []
            supervisions = []
            part_path = corpus_dir / part
            futures = []
            for trans_path in tqdm(part_path.rglob('*.trans.txt'),
                                   desc='Distributing tasks', leave=False):
                # "trans_path" file contains lines like:
                #
                #   121-121726-0000 ALSO A POPULAR CONTRIVANCE
                #   121-121726-0001 HARANGUE THE TIRESOME PRODUCT OF A TIRELESS TONGUE
                #   121-121726-0002 ANGOR PAIN PAINFUL TO HEAR
                #
                # We will create a separate Recording and SupervisionSegment for those.
                with open(trans_path) as f:
                    for line in f:
                        futures.append(
                            ex.submit(parse_utterance, part_path, line))

            for future in tqdm(futures, desc='Processing', leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segment = result
                recordings.append(recording)
                supervisions.append(segment)

            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)

            validate_recordings_and_supervisions(recording_set, supervision_set)

            if output_dir is not None:
                supervision_set.to_json(output_dir / f'supervisions_{part}.json')
                recording_set.to_json(output_dir / f'recordings_{part}.json')

            manifests[part] = {
                'recordings': recording_set,
                'supervisions': supervision_set
            }

    return dict(manifests)  # Convert to normal dict
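# A rough sketch of what the ``parse_utterance`` helper submitted above might do
# (this is an assumption for illustration -- the real helper is defined elsewhere
# in the recipe module): it maps one transcript line to a
# (Recording, SupervisionSegment) pair, or None if the audio file is missing.
def _parse_utterance_sketch(part_path, line):  # hypothetical, for illustration only
    from lhotse import Recording, SupervisionSegment

    recording_id, text = line.strip().split(maxsplit=1)
    # LibriSpeech layout: {part}/{speaker}/{chapter}/{utterance_id}.flac
    speaker, chapter, _ = recording_id.split('-')
    audio_path = part_path / speaker / chapter / f'{recording_id}.flac'
    if not audio_path.is_file():
        return None
    recording = Recording.from_file(audio_path, recording_id=recording_id)
    segment = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language='English',
        speaker=speaker,
        text=text,
    )
    return recording, segment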
def prepare_commonvoice(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    languages: Union[str, Sequence[str]] = "auto",
    splits: Union[str, Sequence[str]] = COMMONVOICE_DEFAULT_SPLITS,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    This function expects the input directory structure of::

        >>> metadata_path = corpus_dir / language_code / "{train,dev,test}.tsv"
        >>> # e.g. pl_train_metadata_path = "/path/to/cv-corpus-7.0-2021-07-21/pl/train.tsv"
        >>> audio_path = corpus_dir / language_code / "clips"
        >>> # e.g. pl_audio_path = "/path/to/cv-corpus-7.0-2021-07-21/pl/clips"

    Returns a dict with 3-level structure (lang -> split -> manifest-type)::

        >>> {'en/fr/pl/...': {'train/dev/test': {'recordings/supervisions': manifest}}}

    :param corpus_dir: Pathlike, the path to the downloaded corpus.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param languages: 'auto' (prepare all discovered data) or a list of language codes.
    :param splits: by default ``['train', 'dev', 'test']``, can also include
        ``'validated'``, ``'invalidated'``, and ``'other'``.
    :param num_jobs: How many concurrent workers to use for scanning of the audio files.
    :return: a dict with manifests for all specified languages and their train/dev/test splits.
    """
    if not is_module_available("pandas"):
        raise ValueError(
            "To prepare CommonVoice data, please 'pip install pandas' first."
        )
    if num_jobs > 1:
        warnings.warn(
            "num_jobs>1 currently not supported for CommonVoice data prep; "
            "setting to 1."
        )
        num_jobs = 1

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    assert output_dir is not None, (
        "CommonVoice recipe requires to specify the output "
        "manifest directory (output_dir cannot be None)."
    )
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if languages == "auto":
        languages = set(COMMONVOICE_LANGS).intersection(
            path.name for path in corpus_dir.glob("*")
        )
        if not languages:
            raise ValueError(
                f"Could not find any of CommonVoice languages in: {corpus_dir}"
            )
    elif isinstance(languages, str):
        languages = [languages]

    manifests = {}

    for lang in tqdm(languages, desc="Processing CommonVoice languages"):
        logging.info(f"Language: {lang}")
        lang_path = corpus_dir / lang

        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        # Pattern: "cv_recordings_en_train.jsonl.gz" / "cv_supervisions_en_train.jsonl.gz"
        lang_manifests = read_cv_manifests_if_cached(
            output_dir=output_dir, language=lang
        )

        for part in splits:
            logging.info(f"Split: {part}")
            if part in lang_manifests:
                logging.info(
                    f"CommonVoice language: {lang} split: {part} already prepared - skipping."
                )
                continue
            recording_set, supervision_set = prepare_single_commonvoice_tsv(
                lang=lang,
                part=part,
                output_dir=output_dir,
                lang_path=lang_path,
            )
            lang_manifests[part] = {
                "supervisions": supervision_set,
                "recordings": recording_set,
            }

        manifests[lang] = lang_manifests

    return manifests
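# A minimal usage sketch (hypothetical paths; the corpus layout matches the
# docstring above):
def _example_prepare_commonvoice():  # hypothetical helper, for illustration only
    cv = prepare_commonvoice(
        corpus_dir="corpora/cv-corpus-7.0-2021-07-21",
        output_dir="data/manifests",
        languages=["pl"],
    )
    print(cv["pl"]["train"]["supervisions"])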
def prepare_hifitts(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the HiFiTTS dataset.

    :param corpus_dir: Path or str, the path to the downloaded corpus main directory.
    :param output_dir: Path or str, the path where to write the manifests.
    :param num_jobs: How many concurrent workers to use for preparing each dataset partition.
    :return: a dict with manifests for all the partitions
        (example query: ``manifests['92_clean_train']['recordings']``).
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    manifests = {}
    json_manifests = list(corpus_dir.glob("*.json"))
    dataset_partitions = [to_partition_id(p) for p in json_manifests]

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_partitions, output_dir=output_dir, prefix="hifitts"
        )

    with ProcessPoolExecutor(num_jobs) as ex:
        futures = []
        partition_ids = []
        for raw_manifest_path in json_manifests:
            speaker_id, _, clean_or_other, part = raw_manifest_path.stem.split("_")
            partition_id = to_partition_id(raw_manifest_path)
            if manifests_exist(
                part=partition_id, output_dir=output_dir, prefix="hifitts"
            ):
                logging.info(
                    f"HiFiTTS subset: {partition_id} already prepared - skipping."
                )
                continue
            futures.append(
                ex.submit(
                    prepare_single_partition,
                    raw_manifest_path,
                    corpus_dir,
                    speaker_id,
                    clean_or_other,
                )
            )
            partition_ids.append(partition_id)

        # Iterate in submission order so each future stays paired with its
        # partition id (``as_completed`` would yield futures in completion
        # order and mismatch the ids).
        for future, partition_id in tqdm(
            zip(futures, partition_ids),
            desc="Preparing HiFiTTS parts",
            total=len(futures),
        ):
            recordings, supervisions = future.result()

            if output_dir is not None:
                supervisions.to_json(
                    output_dir / f"hifitts_supervisions_{partition_id}.json"
                )
                recordings.to_json(
                    output_dir / f"hifitts_recordings_{partition_id}.json"
                )

            manifests[partition_id] = {
                "recordings": recordings,
                "supervisions": supervisions,
            }

    return manifests
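# A plausible ``to_partition_id`` for reference (an assumption inferred from the
# "{speaker}_manifest_{clean_or_other}_{split}.json" naming unpacked above; the
# real helper is defined elsewhere in the recipe):
def _to_partition_id_sketch(path: Path) -> str:  # hypothetical, for illustration only
    speaker_id, _, clean_or_other, part = path.stem.split("_")
    return f"{speaker_id}_{clean_or_other}_{part}"  # e.g. "92_clean_train"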