Example #1
0
def combine(manifests: Pathlike, output_manifest: Pathlike):
    """Load MANIFESTS, combine them into a single one, and write it to OUTPUT_MANIFEST."""
    from lhotse import load_manifest
    from lhotse.manipulation import combine as combine_manifests

    # Load every input manifest, merge them, and serialize the result.
    loaded = (load_manifest(path) for path in manifests)
    combined = combine_manifests(*loaded)
    combined.to_file(output_manifest)
Example #2
0
def combine(manifests: Pathlike, output_manifest: Pathlike):
    """Load MANIFESTS, combine them into a single one, and write it to OUTPUT_MANIFEST."""
    # Import locally (consistent with the sibling CLI helpers) so the names
    # are guaranteed to be in scope regardless of module-level imports.
    from lhotse import load_manifest
    from lhotse.manipulation import combine as combine_manifests

    data_set = combine_manifests(*[load_manifest(m) for m in manifests])
    # to_file() dispatches on the output extension (json/jsonl/gz/yaml...),
    # unlike to_json() which forced a single serialization format.
    data_set.to_file(output_manifest)
Example #3
0
def copy_feats(
    input_manifest: Pathlike,
    output_manifest: Pathlike,
    storage_path: str,
    storage_type: str,
    max_jobs: int,
) -> None:
    """
    Load INPUT_MANIFEST of type :class:`lhotse.FeatureSet` or `lhotse.CutSet`,
    read every feature matrix using ``features.load()`` or ``cut.load_features()``,
    save them in STORAGE_PATH and save the updated manifest to OUTPUT_MANIFEST.

    :param input_manifest: path to the manifest to load (FeatureSet or CutSet).
    :param output_manifest: path where the updated manifest is written.
    :param storage_path: destination for the copied feature data
        (may be a URL — see the directory-creation guard below).
    :param storage_type: name of a feature writer, resolved via ``get_writer``.
    :param max_jobs: upper bound on parallel worker processes;
        a non-positive value means "one job per source feature file".
    :raises ValueError: if the loaded manifest is neither a FeatureSet nor a CutSet.
    """
    from lhotse.serialization import load_manifest_lazy_or_eager
    from lhotse.manipulation import combine as combine_manifests

    manifests = load_manifest_lazy_or_eager(input_manifest)

    if isinstance(manifests, FeatureSet):
        with get_writer(storage_type)(storage_path) as w:
            # FeatureSet is copied in-memory and written (TODO: make it incremental if needed)
            manifests = manifests.copy_feats(writer=w)
            manifests.to_file(output_manifest)

    elif isinstance(manifests, CutSet):
        # Group cuts by their underlying feature files.
        # NOTE: itertools.groupby only groups *consecutive* equal keys,
        # so the preceding sort by the same key is required for correctness.
        manifests = sorted(manifests,
                           key=lambda cut: cut.features.storage_path)
        subsets = groupby(manifests, lambda cut: cut.features.storage_path)
        unique_storage_paths, subsets = zip(*[(k, CutSet.from_cuts(grp))
                                              for k, grp in subsets])

        # Create paths for new feature files and subset cutsets.
        # One output feature file and one partial manifest per source feature file.
        tot_items = len(unique_storage_paths)
        new_storage_paths = [
            f"{storage_path}/feats-{i}" for i in range(tot_items)
        ]
        partial_manifest_paths = [
            f"{storage_path}/cuts-{i}.jsonl.gz" for i in range(tot_items)
        ]

        # Default to one worker per partition, optionally capped by max_jobs.
        num_jobs = len(unique_storage_paths)
        if max_jobs > 0:
            num_jobs = min(num_jobs, max_jobs)

        # Create directory if needed (storage_path might be an URL)
        # NOTE(review): mkdir is called without parents=True — assumes the
        # parent directory already exists (the is_dir() guard checks that).
        if Path(storage_path).parent.is_dir():
            Path(storage_path).mkdir(exist_ok=True)

        # Copy each partition in parallel and combine lazily opened manifests.
        # as_completed() yields futures in completion order, so the cut order
        # in the combined manifest may differ from the sorted input order.
        with ProcessPoolExecutor(num_jobs) as ex:
            futures = []
            for cs, nsp, pmp in zip(subsets, new_storage_paths,
                                    partial_manifest_paths):
                futures.append(
                    ex.submit(copy_feats_worker, cs, nsp, storage_type, pmp))

            all_cuts = combine_manifests(
                (f.result() for f in as_completed(futures)))

        # Combine and save subset cutsets into the final file.
        with CutSet.open_writer(output_manifest) as w:
            for c in all_cuts:
                w.write(c)
    else:
        raise ValueError(
            f"Unsupported manifest type ({type(manifests)}) at: {input_manifest}"
        )