def combine(manifests: Pathlike, output_manifest: Pathlike):
    """Load MANIFESTS, combine them into a single one, and write it to OUTPUT_MANIFEST."""
    # Deferred imports keep the CLI module cheap to load.
    from lhotse import load_manifest
    from lhotse.manipulation import combine as combine_manifests

    loaded = [load_manifest(path) for path in manifests]
    combined = combine_manifests(*loaded)
    combined.to_file(output_manifest)
def combine(manifests: Pathlike, output_manifest: Pathlike):
    """Load MANIFESTS, combine them into a single one, and write it to OUTPUT_MANIFEST."""
    # Fix: these names were used without any import in scope (the sibling
    # `combine` imports them locally; no module-level import is visible),
    # so calling this function raised NameError.
    from lhotse import load_manifest
    from lhotse.manipulation import combine as combine_manifests

    data_set = combine_manifests(*[load_manifest(m) for m in manifests])
    # NOTE(review): always serializes as JSON regardless of the extension of
    # OUTPUT_MANIFEST; the sibling `combine` uses `to_file`, which dispatches
    # on extension — confirm which behavior callers expect.
    data_set.to_json(output_manifest)
def copy_feats(
    input_manifest: Pathlike,
    output_manifest: Pathlike,
    storage_path: str,
    storage_type: str,
    max_jobs: int,
) -> None:
    """
    Load INPUT_MANIFEST of type :class:`lhotse.FeatureSet` or `lhotse.CutSet`,
    read every feature matrix using ``features.load()`` or ``cut.load_features()``,
    save them in STORAGE_PATH and save the updated manifest to OUTPUT_MANIFEST.

    ``storage_type`` selects the feature writer via ``get_writer``; ``max_jobs``
    caps the process pool used for the CutSet path (``<= 0`` means no cap).
    Raises ``ValueError`` for any other manifest type.

    NOTE(review): relies on names defined elsewhere in this file/module:
    ``FeatureSet``, ``CutSet``, ``get_writer``, ``copy_feats_worker``, ``Path``,
    ``groupby``, ``ProcessPoolExecutor``, ``as_completed`` — confirm imports.
    """
    from lhotse.serialization import load_manifest_lazy_or_eager
    from lhotse.manipulation import combine as combine_manifests

    manifests = load_manifest_lazy_or_eager(input_manifest)

    if isinstance(manifests, FeatureSet):
        with get_writer(storage_type)(storage_path) as w:
            # FeatureSet is copied in-memory and written (TODO: make it incremental if needed)
            manifests = manifests.copy_feats(writer=w)
            manifests.to_file(output_manifest)
    elif isinstance(manifests, CutSet):
        # Group cuts by their underlying feature files.
        # Sorting by storage_path first is required: itertools.groupby only
        # groups consecutive equal keys.
        manifests = sorted(manifests, key=lambda cut: cut.features.storage_path)
        subsets = groupby(manifests, lambda cut: cut.features.storage_path)
        unique_storage_paths, subsets = zip(
            *[(k, CutSet.from_cuts(grp)) for k, grp in subsets]
        )

        # Create paths for new feature files and subset cutsets.
        # One output feature file and one partial cut manifest per source
        # feature file.
        tot_items = len(unique_storage_paths)
        new_storage_paths = [f"{storage_path}/feats-{i}" for i in range(tot_items)]
        partial_manifest_paths = [
            f"{storage_path}/cuts-{i}.jsonl.gz" for i in range(tot_items)
        ]

        # One worker per partition, optionally capped by max_jobs (<= 0 disables the cap).
        num_jobs = len(unique_storage_paths)
        if max_jobs > 0:
            num_jobs = min(num_jobs, max_jobs)

        # Create directory if needed (storage_path might be an URL)
        if Path(storage_path).parent.is_dir():
            Path(storage_path).mkdir(exist_ok=True)

        # Copy each partition in parallel and combine lazily opened manifests.
        # combine_manifests receives a generator of worker results, consumed as
        # each future completes.
        with ProcessPoolExecutor(num_jobs) as ex:
            futures = []
            for cs, nsp, pmp in zip(
                subsets, new_storage_paths, partial_manifest_paths
            ):
                futures.append(
                    ex.submit(copy_feats_worker, cs, nsp, storage_type, pmp)
                )
            all_cuts = combine_manifests(
                (f.result() for f in as_completed(futures))
            )

        # Combine and save subset cutsets into the final file.
        with CutSet.open_writer(output_manifest) as w:
            for c in all_cuts:
                w.write(c)
    else:
        raise ValueError(
            f"Unsupported manifest type ({type(manifests)}) at: {input_manifest}"
        )