def download_gcs_dataset(dataset_name,
                         local_dataset_dir,
                         max_simultaneous_downloads=50):
    """Copies the files of a prepared GCS dataset into a local directory.

    Every path listed under `GCS_DATASETS_DIR/dataset_name` is downloaded,
    except those whose path starts with the `diffs` sub-prefix. Each file is
    written into `local_dataset_dir` under its base name only.

    Args:
      dataset_name: Dataset path, joined onto `GCS_DATASETS_DIR`.
      local_dataset_dir: Directory receiving the downloaded files.
      max_simultaneous_downloads: Number of worker threads in the pool.
    """
    dataset_prefix = posixpath.join(GCS_DATASETS_DIR, dataset_name)
    listed_paths = gcs_files(dataset_prefix)

    # Anything located under the `diffs` prefix is left out of the download.
    diffs_prefix = posixpath.join(dataset_prefix, 'diffs')
    remote_paths = []
    for candidate in listed_paths:
        if candidate.startswith(diffs_prefix):
            continue
        remote_paths.append(candidate)

    with tqdm_utils.async_tqdm(
            total=len(remote_paths),
            desc='Dl Completed...',
            unit=' file',
    ) as progress:

        def _fetch(remote_path):
            # Only the base name is kept, so files land directly in the
            # destination directory.
            file_name = posixpath.basename(remote_path)
            destination = posixpath.join(local_dataset_dir, file_name)
            download_gcs_file(remote_path, destination)
            progress.update(1)

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_simultaneous_downloads) as executor:
            pending = []
            for remote_path in remote_paths:
                pending.append(executor.submit(_fetch, remote_path))
            # `result()` re-raises whatever exception a worker hit.
            for finished in concurrent.futures.as_completed(pending):
                finished.result()
def download_gcs_dataset(
    dataset_name, local_dataset_dir, max_simultaneous_downloads=25
):
    """Copies the files of a prepared GCS dataset into a local directory.

    Entries listed under `GCS_DATASETS_DIR/dataset_name` are copied with
    `tf.io.gfile.copy`, except those whose path starts with the `diffs`
    sub-prefix. Each file is written into `local_dataset_dir` under its base
    name only.

    Args:
      dataset_name: Dataset path, joined onto `GCS_DATASETS_DIR`.
      local_dataset_dir: Directory receiving the copied files.
      max_simultaneous_downloads: Number of worker threads in the pool.

    Raises:
      AssertionError: If `_is_gcs_disabled` is truthy.
    """
    if _is_gcs_disabled:
        raise AssertionError('Cannot download from GCS when _is_gcs_disabled')

    dataset_prefix = posixpath.join(GCS_DATASETS_DIR, dataset_name)
    listed_paths = gcs_listdir(dataset_prefix)

    # Anything located under the `diffs` prefix is left out of the download.
    diffs_prefix = posixpath.join(dataset_prefix, 'diffs')
    remote_paths = []
    for candidate in listed_paths:
        if candidate.startswith(diffs_prefix):
            continue
        remote_paths.append(candidate)

    with tqdm_utils.async_tqdm(
            total=len(remote_paths),
            desc='Dl Completed...',
            unit=' file',
    ) as progress:

        def _fetch(remote_path):
            # e.g. 'gs://tfds-data/datasets/ds/1.0.0/file' ends up as
            # `local_dir/file`.
            source = os.fspath(gcs_path(remote_path))
            destination = os.path.join(
                local_dataset_dir, posixpath.basename(remote_path))
            tf.io.gfile.copy(source, destination)
            progress.update(1)

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_simultaneous_downloads) as executor:
            pending = []
            for remote_path in remote_paths:
                pending.append(executor.submit(_fetch, remote_path))
            # `result()` re-raises whatever exception a worker hit.
            for finished in concurrent.futures.as_completed(pending):
                finished.result()
def download_gcs_folder(
    gcs_folder: epath.Path,
    local_folder: epath.PathLike,
    max_simultaneous_downloads: int = 25,
) -> None:
    """Copies the direct children of a GCS folder into a local folder.

    Every entry yielded by `gcs_folder.iterdir()` is copied with
    `tf.io.gfile.copy` to `local_folder/<entry name>`, except an entry named
    `diffs`.

    Args:
      gcs_folder: Folder whose entries are copied.
      local_folder: Directory receiving the copied files.
      max_simultaneous_downloads: Number of worker threads in the pool.

    Raises:
      AssertionError: If `_is_gcs_disabled` is truthy.
    """
    if _is_gcs_disabled:
        raise AssertionError('Cannot download from GCS when _is_gcs_disabled')

    # The `diffs` entry, when present, is left out of the download.
    sources = []
    for entry in gcs_folder.iterdir():
        if entry.name == 'diffs':
            continue
        sources.append(entry)

    with tqdm_utils.async_tqdm(
            total=len(sources),
            desc='Dl Completed...',
            unit=' file',
    ) as progress:

        def _copy_one(source: epath.Path):
            # e.g. 'gs://tfds-data/datasets/ds/1.0.0/file' ends up as
            # `local_dir/file`.
            origin = os.fspath(source)
            destination = os.path.join(local_folder, source.name)
            tf.io.gfile.copy(origin, destination)
            progress.update(1)

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_simultaneous_downloads) as executor:
            pending = []
            for source in sources:
                pending.append(executor.submit(_copy_one, source))
            # `result()` re-raises whatever exception a worker hit.
            for finished in concurrent.futures.as_completed(pending):
                finished.result()