Example #1
def download_vctk(target_dir: Pathlike = '.',
                  force_download: Optional[bool] = False,
                  url: Optional[str] = CREST_VCTK_URL) -> None:
    """
    Download and untar/unzip the VCTK dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the archive even if it already exists.
    :param url: str, the url of the tarred/zipped VCTK corpus.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    archive_name = url.split('/')[-1]
    archive_path = target_dir / archive_name
    if force_download or not archive_path.is_file():
        urlretrieve_progress(url,
                             filename=archive_path,
                             desc=f'Downloading {archive_name}')
    part_dir = target_dir / archive_name.replace('.zip', '').replace(
        '.tar.gz', '')
    completed_detector = part_dir / '.completed'
    if not completed_detector.is_file():
        shutil.rmtree(part_dir, ignore_errors=True)
        opener = zipfile.ZipFile if archive_name.endswith(
            '.zip') else tarfile.open
        with opener(archive_path) as archive:
            archive.extractall(path=target_dir)
            completed_detector.touch()
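
A minimal usage sketch for the function above (paths are hypothetical; CREST_VCTK_URL and the imports come from the surrounding lhotse-style module):

# Download once into ./data; later calls reuse the finished extraction.
download_vctk(target_dir="data")
# Re-fetch the archive even if it is already on disk.
download_vctk(target_dir="data", force_download=True)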
Example #2
def download_ali_meeting(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[
        str
    ] = "https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/",
) -> Path:
    """
    Download and untar the AliMeeting dataset.
    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param base_url: str, the base URL hosting the AliMeeting tarballs.
    :return: the path to downloaded and extracted directory with data.
    """
    url = f"{base_url}/AliMeeting/openlr"
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    dataset_tar_names = [
        "Train_Ali_far.tar.gz",
        "Train_Ali_near.tar.gz",
        "Eval_Ali.tar.gz",
        "Test_Ali.tar.gz",
    ]
    for tar_name in dataset_tar_names:
        tar_path = target_dir / tar_name
        if force_download or not tar_path.is_file():
            urlretrieve_progress(
                f"{url}/{tar_name}", filename=tar_path, desc=f"Downloading {tar_name}"
            )
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)

    return target_dir
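
Note that the loop above re-extracts the tars on every run, and tarfile.extractall on an untrusted archive can write outside target_dir (path traversal). A defensive sketch, assuming only the standard library:

import sys
import tarfile
from pathlib import Path

def safe_extract(tar_path: Path, dest: Path) -> None:
    # On Python >= 3.12 the "data" filter rejects members that would escape
    # `dest`; on older interpreters we validate each member ourselves.
    with tarfile.open(tar_path) as tar:
        if sys.version_info >= (3, 12):
            tar.extractall(path=dest, filter="data")
        else:
            resolved = dest.resolve()
            for member in tar.getmembers():
                target = (resolved / member.name).resolve()
                if resolved not in target.parents and target != resolved:
                    raise RuntimeError(f"Blocked path traversal: {member.name}")
            tar.extractall(path=dest)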
Example #3
def download_and_untar(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        base_url: Optional[str] = 'http://www.openslr.org/resources') -> None:
    """
    Download and untar the AISHELL dataset.
    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param base_url: str, the url of the OpenSLR resources.
    """

    url = f'{base_url}/33'
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    dataset_tar_name = 'data_aishell.tgz'
    resources_tar_name = 'resource_aishell.tgz'
    for tar_name in [dataset_tar_name, resources_tar_name]:
        tar_path = target_dir / tar_name
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f'{url}/{tar_name}',
                                 filename=tar_path,
                                 desc=f'Downloading {tar_name}')
        corpus_dir = target_dir / 'aishell'
        extracted_dir = corpus_dir / tar_name[:-4]
        completed_detector = extracted_dir / '.completed'
        if not completed_detector.is_file():
            shutil.rmtree(extracted_dir, ignore_errors=True)
            with tarfile.open(tar_path) as tar:
                tar.extractall(path=corpus_dir)
                completed_detector.touch()
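
Examples #1 and #3 repeat the same '.completed' marker idiom. A generic helper along these lines (a sketch, not part of lhotse's public API) would keep the pattern in one place:

import shutil
import tarfile
from pathlib import Path

def extract_once(archive_path: Path, extracted_dir: Path, extract_root: Path) -> None:
    # Skip the work if a previous run finished and left the marker behind.
    completed_detector = extracted_dir / '.completed'
    if completed_detector.is_file():
        return
    # Remove any partial unpack left by an interrupted run, then re-extract.
    shutil.rmtree(extracted_dir, ignore_errors=True)
    with tarfile.open(archive_path) as tar:
        tar.extractall(path=extract_root)
    completed_detector.touch()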
Example #4
def download_cmu_indic(
    target_dir: Pathlike = ".",
    speakers: Sequence[str] = SPEAKERS,
    force_download: Optional[bool] = False,
    base_url: Optional[str] = BASE_URL,
) -> None:
    """
    Download and untar the CMU Indic dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param speakers: a list of speakers to download. By default, downloads all.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param base_url: str, the url of the CMU Indic download site.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for spk in tqdm(speakers, desc="Downloading/unpacking CMU Indic speakers"):
        name = f"cmu_indic_{spk}"
        tar_name = f"{name}.tar.bz2"
        full_url = f"{base_url}{tar_name}"
        tar_path = target_dir / tar_name
        part_dir = target_dir / name
        completed_detector = part_dir / ".completed"
        if completed_detector.is_file():
            logging.info(f"Skiping {spk} because {completed_detector} exists.")
            continue
        if force_download or not tar_path.is_file():
            urlretrieve_progress(full_url,
                                 filename=tar_path,
                                 desc=f"Downloading {tar_name}")
        shutil.rmtree(part_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()
Example #5
def download_mobvoihotwords(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> None:
    """
    Download and untar the MobvoiHotwords dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param base_url: str, the url of the OpenSLR resources.
    """

    url = f"{base_url}/87"
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    dataset_tar_name = "mobvoi_hotword_dataset.tgz"
    resources_tar_name = "mobvoi_hotword_dataset_resources.tgz"
    for tar_name in [dataset_tar_name, resources_tar_name]:
        tar_path = target_dir / tar_name
        corpus_dir = target_dir / "MobvoiHotwords"
        extracted_dir = corpus_dir / tar_name[:-4]
        completed_detector = extracted_dir / ".completed"
        if completed_detector.is_file():
            logging.info(
                f"Skip {tar_name} because {completed_detector} exists.")
            continue
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f"{url}/{tar_name}",
                                 filename=tar_path,
                                 desc=f"Downloading {tar_name}")
        shutil.rmtree(extracted_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=corpus_dir)
        completed_detector.touch()
Example #6
def download_vctk(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    url: Optional[str] = CREST_VCTK_URL,
) -> Path:
    """
    Download and untar/unzip the VCTK dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the archive even if it already exists.
    :param url: str, the url of the tarred/zipped VCTK corpus.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    archive_name = url.split("/")[-1]
    archive_path = target_dir / archive_name
    part_dir = target_dir / archive_name.replace(".zip", "").replace(".tar.gz", "")
    completed_detector = part_dir / ".completed"
    if completed_detector.is_file():
        logging.info(f"Skipping {archive_name} because {completed_detector} exists.")
        return part_dir
    if force_download or not archive_path.is_file():
        urlretrieve_progress(
            url, filename=archive_path, desc=f"Downloading {archive_name}"
        )
    shutil.rmtree(part_dir, ignore_errors=True)
    opener = zipfile.ZipFile if archive_name.endswith(".zip") else tarfile.open
    with opener(archive_path) as archive:
        archive.extractall(path=target_dir)
    completed_detector.touch()
    return part_dir
Example #7
File: adept.py Project: glynpu/lhotse
def download_adept(
    target_dir: Pathlike = ".",
    force_download: bool = False,
) -> Path:
    """
    Download and untar the ADEPT dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the zip even if it already exists.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    corpus_dir = target_dir / "ADEPT"
    completed_detector = corpus_dir / ".completed"
    if completed_detector.is_file():
        logging.info(
            f"Skipping downloading ADEPT because {completed_detector} exists.")
        return corpus_dir
    # Maybe-download the archive.
    zip_name = "ADEPT.zip"
    zip_path = target_dir / zip_name
    if force_download or not zip_path.is_file():
        urlretrieve_progress(ADEPT_URL,
                             filename=zip_path,
                             desc=f"Downloading {zip_name}")
    # Remove partial unpacked files, if any, and unpack everything.
    shutil.rmtree(corpus_dir, ignore_errors=True)
    with zipfile.ZipFile(zip_path) as zip_f:
        zip_f.extractall(path=corpus_dir)
    completed_detector.touch()

    return corpus_dir
Example #8
def download_aishell4(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> Path:
    """
    Download and untar the AISHELL-4 dataset.
    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    """
    url = f"{base_url}/111"
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    dataset_tar_names = [
        "train_L.tar.gz",
        "train_M.tar.gz",
        "train_S.tar.gz",
        "test.tar.gz",
    ]
    for tar_name in dataset_tar_names:
        tar_path = target_dir / tar_name
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f"{url}/{tar_name}",
                                 filename=tar_path,
                                 desc=f"Downloading {tar_name}")
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)

    return target_dir
Example #9
def download_timit(
    target_dir: Pathlike = ".",
    force_download: bool = False,
    base_url: Optional[str] = "https://data.deepai.org/timit.zip",
) -> None:
    """
    Download and unzip the TIMIT dataset.
    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: bool, if True, download the zip even if it already exists.
    :param base_url: str, the URL of the TIMIT dataset to download.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    zip_name = "timit.zip"
    zip_path = target_dir / zip_name
    corpus_dir = zip_path.with_suffix("")
    completed_detector = corpus_dir / ".completed"
    if completed_detector.is_file():
        logging.info(
            f"Skipping {zip_name} because {completed_detector} exists.")
        return
    if force_download or not zip_path.is_file():
        urlretrieve_progress(base_url,
                             filename=zip_path,
                             desc=f"Downloading {zip_name}")

    with zipfile.ZipFile(zip_path) as zip_file:
        corpus_dir.mkdir(parents=True, exist_ok=True)
        for names in zip_file.namelist():
            zip_file.extract(names, str(corpus_dir))
Example #10
def extract(recording_manifest: Pathlike, output_dir: Pathlike,
            feature_manifest: Optional[Pathlike], storage_type: str,
            lilcom_tick_power: int, root_dir: Optional[Pathlike],
            num_jobs: int):
    """
    Extract features for recordings in a given RECORDING_MANIFEST. The features are stored in OUTPUT_DIR,
    with one file per recording (or segment).
    """
    recordings: RecordingSet = RecordingSet.from_json(recording_manifest)
    if root_dir is not None:
        recordings = recordings.with_path_prefix(root_dir)

    feature_extractor = (FeatureExtractor.from_yaml(feature_manifest)
                         if feature_manifest is not None else Fbank())

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)
    storage_path = output_dir / 'feats.h5' if 'hdf5' in storage_type else output_dir / 'storage'

    with get_writer(storage_type)(storage_path,
                                  tick_power=lilcom_tick_power) as storage:
        feature_set_builder = FeatureSetBuilder(
            feature_extractor=feature_extractor,
            storage=storage,
        )
        feature_set_builder.process_and_store_recordings(
            recordings=recordings,
            output_manifest=output_dir / 'feature_manifest.json.gz',
            num_jobs=num_jobs)
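
A hedged usage sketch for the command above; the manifest paths are hypothetical and the storage_type name should be checked against get_writer() in the lhotse version in use:

extract(
    recording_manifest="data/recordings.json",
    output_dir="data/feats",
    feature_manifest=None,       # None falls back to Fbank() defaults
    storage_type="lilcom_hdf5",  # assumed writer name; see get_writer()
    lilcom_tick_power=-5,
    root_dir=None,
    num_jobs=4,
)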
Example #11
def download_rir_noise(
    target_dir: Pathlike = ".",
    url: Optional[str] = RIR_NOISE_ZIP_URL,
    force_download: Optional[bool] = False,
) -> Path:
    """
    Download and untar the RIR Noise corpus.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param url: str, the url pointing to the file called "rirs_noises.zip".
    :param force_download: bool, if True, download the archive even if it already exists.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    zip_name = "rirs_noises.zip"
    zip_path = target_dir / zip_name
    if zip_path.exists() and not force_download:
        logging.info(f"Skipping {zip_name} because file exists.")
    else:
        urlretrieve_progress(url, zip_path, desc=f"Downloading {zip_name}")
        logging.info(f"Downloaded {zip_name}.")
    zip_dir = target_dir / "RIRS_NOISES"
    if not zip_dir.exists():
        logging.info(f"Unzipping {zip_name}.")
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(target_dir)
    return zip_dir
Example #12
def split(num_splits: int, manifest: Pathlike, output_dir: Pathlike):
    """Load MANIFEST, split it into NUM_SPLITS equal parts and save as separate manifests in OUTPUT_DIR. """
    output_dir = Path(output_dir)
    manifest = Path(manifest)
    data_set = load_manifest(manifest)
    parts = split_manifest(manifest=data_set, num_splits=num_splits)
    output_dir.mkdir(parents=True, exist_ok=True)
    for idx, part in enumerate(parts):
        part.to_json(output_dir / f'{manifest.stem}.{idx + 1}.json')
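
For instance, with hypothetical paths, splitting a supervisions manifest into four parts:

# Produces exp/splits/supervisions.1.json ... exp/splits/supervisions.4.json.
split(num_splits=4, manifest="data/supervisions.json", output_dir="exp/splits")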
Example #13
def prepare_norm_cn(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    num_jobs: int = 15,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the VoxCeleb1 corpus. The manifests are created in a dict with
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)
    dataset_parts = ["dev", "test", "train"]
    for part in dataset_parts:
        transcript_path = corpus_dir / f"{part}/text.txt"
        transcript_dict = {}
        with open(transcript_path, "r", encoding="utf-8") as f:
            for line in f.readlines():
                idx_transcript = line.split()
                if len(idx_transcript) < 2:
                    logging.info(f"Skipping invalid transcript line: {line}")
                    continue
                transcript_dict[idx_transcript[0]] = " ".join(idx_transcript[1:])
        file_path = corpus_dir / f"{part}/wav.scp"
        file_paths = []
        with open(file_path, "r", encoding="utf-8") as f:
            file_paths = [line.strip() for line in f]

        recordings = []
        supervisions = []
        with ThreadPoolExecutor(num_jobs) as ex:
            for recording, supervision in tqdm(
                ex.map(
                    process_file,
                    file_paths,
                    repeat(transcript_dict),
                ),
                desc="Processing NormcnSpeech JSON entries",
                leave=False,
            ):
                if recording is not None:
                    recordings.append(recording)
                    supervisions.append(supervision)

        supervision_set = SupervisionSet.from_segments(supervisions)
        recording_set = RecordingSet.from_recordings(recordings)
        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{part}.json")
            recording_set.to_json(output_dir / f"recordings_{part}.json")
        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}
    return manifests
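
The process_file helper is not shown in this example. A hypothetical sketch of its shape, assuming each wav.scp line holds a plain path and the utterance ID is the wav file name without its extension:

from pathlib import Path
from typing import Dict, Optional, Tuple

from lhotse import Recording, SupervisionSegment

def process_file(
    wav_path: str, transcript_dict: Dict[str, str]
) -> Tuple[Optional[Recording], Optional[SupervisionSegment]]:
    # Assumption: utterance IDs in text.txt match the wav file stems.
    utt_id = Path(wav_path).stem
    text = transcript_dict.get(utt_id)
    if text is None:
        return None, None
    recording = Recording.from_file(wav_path, recording_id=utt_id)
    supervision = SupervisionSegment(
        id=utt_id,
        recording_id=utt_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        text=text,
    )
    return recording, supervision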
Example #14
File: kaldi.py Project: zcth428/lhotse
def convert_kaldi(data_dir: Pathlike, sampling_rate: int, manifest_dir: Pathlike):
    """
    Convert a Kaldi data dir DATA_DIR into a directory MANIFEST_DIR of lhotse manifests. Ignores feats.scp.
    The SAMPLING_RATE has to be explicitly specified as it is not available to read from DATA_DIR.
    """
    recording_set, maybe_supervision_set = load_kaldi_data_dir(path=data_dir, sampling_rate=sampling_rate)
    manifest_dir = Path(manifest_dir)
    manifest_dir.mkdir(parents=True, exist_ok=True)
    recording_set.to_json(manifest_dir / 'audio.json')
    if maybe_supervision_set is not None:
        maybe_supervision_set.to_json(manifest_dir / 'supervision.json')
Example #15
def split(num_splits: int, manifest: Pathlike, output_dir: Pathlike,
          shuffle: bool):
    """
    Load MANIFEST, split it into NUM_SPLITS equal parts and save as separate manifests in OUTPUT_DIR.
    """
    output_dir = Path(output_dir)
    manifest = Path(manifest)
    suffix = ''.join(manifest.suffixes)
    any_set = load_manifest(manifest)
    parts = any_set.split(num_splits=num_splits, shuffle=shuffle)
    output_dir.mkdir(parents=True, exist_ok=True)
    for idx, part in enumerate(parts):
        part.to_json(
            (output_dir / manifest.stem).with_suffix(f'.{idx + 1}{suffix}'))
Example #16
def download_earnings21(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    url: Optional[str] = _DEFAULT_URL,
) -> Path:
    """Download and untar the dataset.
    :param target_dir: Pathlike, the path of the dir to store the dataset.
        The extracted files are saved to target_dir/earnings21/
        Please note that the GitHub repository contains other datasets as well;
        using this call, you will download all of them and then throw the rest away.
    :param force_download: Bool, if True, download the tar file no matter
        whether it exists or not.
    :param url: str, the url to download the dataset.
    :return: the path to downloaded and extracted directory with data.
    """
    logging.info(
        "Downloading Earnings21 from the GitHub repository is not a very"
        " efficient way to obtain the corpus. You will be downloading other"
        " data as well."
    )
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    extracted_dir = target_dir / "earnings21"

    zip_path = target_dir / "speech-datasets-main.zip"

    completed_detector = extracted_dir / ".lhotse-download.completed"
    if completed_detector.is_file():
        logging.info(f"Skipping - {completed_detector} exists.")
        return extracted_dir

    if force_download or not zip_path.is_file():
        urlretrieve_progress(url,
                             filename=zip_path,
                             desc="Getting speech-datasets-main.zip")

    shutil.rmtree(extracted_dir, ignore_errors=True)

    with zipfile.ZipFile(zip_path) as zip_f:
        for f in zip_f.namelist():
            if "earnings21" in f:
                zip_f.extract(f, path=target_dir)

    shutil.move(target_dir / "speech-datasets-main" / "earnings21", target_dir)
    shutil.rmtree(target_dir / "speech-datasets-main")

    completed_detector.touch()

    return extracted_dir
Example #17
def download_and_untar(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        url: Optional[str] = 'http://www.openslr.org/resources/39') -> None:
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = 'LDC2006S37.tar.gz'
    tar_path = target_dir / tar_name
    if force_download or not tar_path.is_file():
        urllib.request.urlretrieve(f'{url}/{tar_name}', filename=tar_path)

    completed_detector = target_dir / '.completed'
    if not completed_detector.is_file():
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
            completed_detector.touch()
Example #18
def download_and_untar(target_dir: Pathlike = '.',
                       force_download: bool = False,
                       url: str = SWBD_TEXT_URL) -> Path:
    target_dir = Path(target_dir)
    transcript_dir = target_dir / 'swb_ms98_transcriptions'
    if transcript_dir.is_dir():
        return transcript_dir
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = 'switchboard_word_alignments.tar.gz'
    tar_path = target_dir / tar_name
    if force_download or not tar_path.is_file():
        urllib.request.urlretrieve(url, filename=tar_path)
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    return transcript_dir
Example #19
def import_(data_dir: Pathlike, sampling_rate: int, manifest_dir: Pathlike,
            frame_shift: Seconds):
    """
    Convert a Kaldi data dir DATA_DIR into a directory MANIFEST_DIR of lhotse manifests. Ignores feats.scp.
    The SAMPLING_RATE has to be explicitly specified as it is not available to read from DATA_DIR.
    """
    recording_set, maybe_supervision_set, maybe_feature_set = load_kaldi_data_dir(
        path=data_dir, sampling_rate=sampling_rate, frame_shift=frame_shift)
    manifest_dir = Path(manifest_dir)
    manifest_dir.mkdir(parents=True, exist_ok=True)
    recording_set.to_file(manifest_dir / 'recordings.jsonl.gz')
    if maybe_supervision_set is not None:
        maybe_supervision_set.to_file(manifest_dir / 'supervisions.jsonl.gz')
    if maybe_feature_set is not None:
        maybe_feature_set.to_file(manifest_dir / 'features.jsonl.gz')
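
A usage sketch with assumed paths; frame_shift is in seconds:

# Convert a 16 kHz Kaldi dir, using a 10 ms frame shift for feats.scp.
import_(
    data_dir="data/train",
    sampling_rate=16000,
    manifest_dir="manifests/train",
    frame_shift=0.01,
)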
Example #20
def download_and_untar(target_dir: Pathlike = '.',
                       force_download: Optional[bool] = False) -> None:
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_path = target_dir / 'TEDLIUM_release-3.tgz'
    if force_download or not tar_path.is_file():
        urllib.request.urlretrieve(
            'http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz',
            filename=tar_path)
    corpus_dir = target_dir / 'TEDLIUM_release-3'
    completed_detector = corpus_dir / '.completed'
    if not completed_detector.is_file():
        shutil.rmtree(corpus_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
            completed_detector.touch()
Example #21
def download_and_untar(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False
) -> None:
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    dataset_name = 'LJSpeech-1.1'
    tar_path = target_dir / f'{dataset_name}.tar.bz2'
    if force_download or not tar_path.is_file():
        urllib.request.urlretrieve(f'http://data.keithito.com/data/speech/{dataset_name}.tar.bz2', filename=tar_path)
    corpus_dir = target_dir / dataset_name
    completed_detector = corpus_dir / '.completed'
    if not completed_detector.is_file():
        shutil.rmtree(corpus_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
            completed_detector.touch()
Example #22
def download_callhome_metadata(
        target_dir: Pathlike = '.',
        force_download: bool = False,
        url: str = "http://www.openslr.org/resources/10/sre2000-key.tar.gz"
) -> Path:
    target_dir = Path(target_dir)
    sre_dir = target_dir / 'sre2000-key'
    if sre_dir.is_dir():
        return sre_dir
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = 'sre2000-key.tar.gz'
    tar_path = target_dir / tar_name
    if force_download or not tar_path.is_file():
        urlretrieve_progress(url, filename=tar_path, desc=f'Downloading {tar_name}')
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    return sre_dir
Example #23
def download_and_unzip(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        url: Optional[str] = 'https://zenodo.org/record/3871592/files/MiniLibriMix.zip'
) -> None:
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    zip_path = target_dir / 'MiniLibriMix.zip'
    if force_download or not zip_path.is_file():
        urlretrieve_progress(url, filename=zip_path, desc='Downloading MiniLibriMix')
    unzipped_dir = target_dir / 'MiniLibriMix'
    completed_detector = unzipped_dir / '.completed'
    if not completed_detector.is_file():
        shutil.rmtree(unzipped_dir, ignore_errors=True)
        with ZipFile(zip_path) as zf:
            zf.extractall(path=target_dir)
            completed_detector.touch()
Example #24
def download_and_untar(target_dir: Pathlike = ".",
                       force_download: bool = False,
                       url: str = SWBD_TEXT_URL) -> Path:
    target_dir = Path(target_dir)
    transcript_dir = target_dir / "swb_ms98_transcriptions"
    if transcript_dir.is_dir():
        return transcript_dir
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = "switchboard_word_alignments.tar.gz"
    tar_path = target_dir / tar_name
    if force_download or not tar_path.is_file():
        urlretrieve_progress(url,
                             filename=tar_path,
                             desc=f"Downloading {tar_name}")
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    return transcript_dir
Example #25
def download_and_untar(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        url: Optional[str] = 'http://www.openslr.org/resources/31') -> None:
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    for part in dataset_parts:
        tar_name = f'{part}.tar.gz'
        tar_path = target_dir / tar_name
        if force_download or not tar_path.is_file():
            urllib.request.urlretrieve(f'{url}/{tar_name}', filename=tar_path)
        part_dir = target_dir / f'LibriSpeech/{part}'
        completed_detector = part_dir / '.completed'
        if not completed_detector.is_file():
            shutil.rmtree(part_dir, ignore_errors=True)
            with tarfile.open(tar_path) as tar:
                tar.extractall(path=target_dir)
                completed_detector.touch()
Example #26
def split(num_splits: int, manifest: Pathlike, output_dir: Pathlike,
          shuffle: bool):
    """
    Load MANIFEST, split it into NUM_SPLITS equal parts and save as separate manifests in OUTPUT_DIR.
    """
    from lhotse import load_manifest

    output_dir = Path(output_dir)
    manifest = Path(manifest)
    suffix = "".join(manifest.suffixes)
    any_set = load_manifest(manifest)
    parts = any_set.split(num_splits=num_splits, shuffle=shuffle)
    output_dir.mkdir(parents=True, exist_ok=True)
    num_digits = len(str(num_splits))
    for idx, part in enumerate(parts):
        idx = f"{idx + 1}".zfill(num_digits)
        part.to_file(
            (output_dir / manifest.stem).with_suffix(f".{idx}{suffix}"))
Example #27
def download_libritts(
    target_dir: Pathlike = ".",
    dataset_parts: Optional[Union[str, Sequence[str]]] = "all",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> Path:
    """
    Download and untar the LibriTTS dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param dataset_parts: "all", or a list of LibriTTS splits (e.g. "dev-clean") to download.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    if dataset_parts == "all":
        dataset_parts = LIBRITTS

    for part in tqdm(dataset_parts, desc="Downloading LibriTTS parts"):
        if part not in LIBRITTS:
            logging.warning(f"Skipping invalid dataset part name: {part}")
            continue
        url = f"{base_url}/60"
        tar_name = f"{part}.tar.gz"
        tar_path = target_dir / tar_name
        part_dir = target_dir / f"LibriTTS/{part}"
        completed_detector = part_dir / ".completed"
        if completed_detector.is_file():
            logging.info(
                f"Skipping {part} because {completed_detector} exists.")
            continue
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f"{url}/{tar_name}",
                                 filename=tar_path,
                                 desc=f"Downloading {tar_name}")
        shutil.rmtree(part_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()

    return target_dir
Example #28
def download_librispeech(
        target_dir: Pathlike = '.',
        dataset_parts: Optional[Union[str,
                                      Sequence[str]]] = "mini_librispeech",
        force_download: Optional[bool] = False,
        base_url: Optional[str] = 'http://www.openslr.org/resources') -> None:
    """
    Download and untar the dataset, supporting both LibriSpeech and MiniLibrispeech

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param dataset_parts: "librispeech", "mini_librispeech",
        or a list of splits (e.g. "dev-clean") to download.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param base_url: str, the url of the OpenSLR resources.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    if dataset_parts == "librispeech":
        dataset_parts = LIBRISPEECH
    elif dataset_parts == "mini_librispeech":
        dataset_parts = MINI_LIBRISPEECH

    for part in tqdm(dataset_parts, desc='Downloading LibriSpeech parts'):
        if part in LIBRISPEECH:
            url = f'{base_url}/12'
        elif part in MINI_LIBRISPEECH:
            url = f'{base_url}/31'
        else:
            logging.warning(f'Invalid dataset part name: {part}')
            continue
        tar_name = f'{part}.tar.gz'
        tar_path = target_dir / tar_name
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f'{url}/{tar_name}',
                                 filename=tar_path,
                                 desc=f'Downloading {tar_name}')
        part_dir = target_dir / f'LibriSpeech/{part}'
        completed_detector = part_dir / '.completed'
        if not completed_detector.is_file():
            shutil.rmtree(part_dir, ignore_errors=True)
            with tarfile.open(tar_path) as tar:
                tar.extractall(path=target_dir)
                completed_detector.touch()
Example #29
def fix_(recordings: Pathlike, supervisions: Pathlike, output_dir: Pathlike):
    """
    Fix a pair of Lhotse RECORDINGS and SUPERVISIONS manifests.
    It removes supervisions without corresponding recordings and vice versa,
    trims the supervisions that exceed the recording, etc.
    Stores the output files in OUTPUT_DIR under the same names as the input
    files.
    """
    from lhotse import RecordingSet, SupervisionSet, fix_manifests

    output_dir = Path(output_dir)
    recordings = Path(recordings)
    supervisions = Path(supervisions)
    output_dir.mkdir(parents=True, exist_ok=True)
    recs = RecordingSet.from_file(recordings)
    sups = SupervisionSet.from_file(supervisions)
    recs, sups = fix_manifests(recordings=recs, supervisions=sups)
    recs.to_file(output_dir / recordings.name)
    sups.to_file(output_dir / supervisions.name)
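
A usage sketch with hypothetical paths; the repaired manifests keep the input file names:

fix_(
    recordings="data/recordings.jsonl.gz",
    supervisions="data/supervisions.jsonl.gz",
    output_dir="exp/fixed",
)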
Example #30
def download_and_untar_sph2pipe(
    target_dir: Pathlike,
    url: str,
    force_download: bool = False,
) -> Path:
    target_dir = Path(target_dir)
    sph2pipe_dir = target_dir / "sph2pipe-2.5"
    if (sph2pipe_dir / "Makefile").is_file() and not force_download:
        return sph2pipe_dir
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = "sph2pipe-2.5.tar.gz"
    tar_path = target_dir / tar_name
    if force_download or not tar_path.is_file():
        urlretrieve_progress(url,
                             filename=tar_path,
                             desc=f"Downloading {tar_name}")
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    return sph2pipe_dir
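
A follow-up sketch: after downloading, the sources still need to be compiled. The URL below is an assumption, not necessarily the mirror this project uses:

import subprocess

sph2pipe_dir = download_and_untar_sph2pipe(
    target_dir="tools",
    url="https://github.com/burrmill/sph2pipe/archive/refs/tags/2.5.tar.gz",  # assumed
)
# Build the `sph2pipe` binary next to the extracted Makefile.
subprocess.run(["make"], cwd=sph2pipe_dir, check=True)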