示例#1
0
def download_gcs_dataset(dataset_name,
                         local_dataset_dir,
                         max_simultaneous_downloads=50):
    """Fetches the files of a prepared GCS dataset into a local directory.

    Args:
        dataset_name: Dataset path, joined onto `GCS_DATASETS_DIR` to form
            the GCS prefix that gets listed.
        local_dataset_dir: Local directory; each destination path is this
            directory joined with the base name of the GCS path, so no
            sub-directory structure is recreated.
        max_simultaneous_downloads: Number of worker threads used to run
            the downloads concurrently.
    """
    dataset_prefix = posixpath.join(GCS_DATASETS_DIR, dataset_name)
    # Paths starting with `<dataset prefix>/diffs` are left out.
    diffs_prefix = posixpath.join(dataset_prefix, 'diffs')

    remote_paths = []
    for remote_path in gcs_files(dataset_prefix):
        if remote_path.startswith(diffs_prefix):
            continue
        remote_paths.append(remote_path)

    with tqdm_utils.async_tqdm(total=len(remote_paths),
                               desc='Dl Completed...',
                               unit=' file') as progress:

        def _fetch_one(remote_path):
            # Destination keeps only the file name of the remote path.
            file_name = posixpath.basename(remote_path)
            destination = posixpath.join(local_dataset_dir, file_name)
            download_gcs_file(remote_path, destination)
            progress.update(1)

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_simultaneous_downloads) as pool:
            pending = []
            for remote_path in remote_paths:
                pending.append(pool.submit(_fetch_one, remote_path))
            # result() re-raises any exception raised inside a worker.
            for finished in concurrent.futures.as_completed(pending):
                finished.result()
示例#2
0
def download_gcs_dataset(
    dataset_name, local_dataset_dir, max_simultaneous_downloads=25
):
  """Copies the entries of a prepared GCS dataset into a local directory.

  Args:
    dataset_name: Dataset path, joined onto `GCS_DATASETS_DIR` to form the
      prefix that gets listed.
    local_dataset_dir: Local directory; each entry is copied to this
      directory joined with the entry's base name.
    max_simultaneous_downloads: Number of worker threads used to run the
      copies concurrently.

  Raises:
    AssertionError: If `_is_gcs_disabled` is truthy.
  """
  if _is_gcs_disabled:
    raise AssertionError('Cannot download from GCS when _is_gcs_disabled')

  dataset_prefix = posixpath.join(GCS_DATASETS_DIR, dataset_name)
  # Entries starting with `<dataset prefix>/diffs` are left out.
  diffs_prefix = posixpath.join(dataset_prefix, 'diffs')
  remote_paths = [
      entry
      for entry in gcs_listdir(dataset_prefix)
      if not entry.startswith(diffs_prefix)
  ]

  with tqdm_utils.async_tqdm(
      total=len(remote_paths), desc='Dl Completed...', unit=' file'
  ) as progress:

    def _copy_one(remote_path):
      # e.g. 'gs://tfds-data/datasets/ds/1.0.0/file' -> `local_dir/file`
      source = os.fspath(gcs_path(remote_path))
      destination = os.path.join(
          local_dataset_dir, posixpath.basename(remote_path)
      )
      tf.io.gfile.copy(source, destination)
      progress.update(1)

    with concurrent.futures.ThreadPoolExecutor(
        max_workers=max_simultaneous_downloads
    ) as pool:
      pending = [pool.submit(_copy_one, entry) for entry in remote_paths]
      # result() re-raises any exception raised inside a worker.
      for finished in concurrent.futures.as_completed(pending):
        finished.result()
示例#3
0
def download_gcs_folder(
    gcs_folder: epath.Path,
    local_folder: epath.PathLike,
    max_simultaneous_downloads: int = 25,
) -> None:
    """Copies the direct children of a GCS folder into a local folder.

    Args:
        gcs_folder: Folder whose `iterdir()` entries are copied; an entry
            named `diffs` is skipped.
        local_folder: Local directory; each entry is copied to this
            directory joined with the entry's name.
        max_simultaneous_downloads: Number of worker threads used to run
            the copies concurrently.

    Raises:
        AssertionError: If `_is_gcs_disabled` is truthy.
    """
    if _is_gcs_disabled:
        raise AssertionError('Cannot download from GCS when _is_gcs_disabled')

    # Keep every child except the one named 'diffs'.
    sources = []
    for child in gcs_folder.iterdir():
        if child.name == 'diffs':
            continue
        sources.append(child)

    with tqdm_utils.async_tqdm(total=len(sources),
                               desc='Dl Completed...',
                               unit=' file') as progress:

        def _copy_one(source: epath.Path):
            # e.g. 'gs://tfds-data/datasets/ds/1.0.0/file' -> `local_dir/file`
            source_str = os.fspath(source)
            destination = os.path.join(local_folder, source.name)
            tf.io.gfile.copy(source_str, destination)
            progress.update(1)

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_simultaneous_downloads) as pool:
            pending = []
            for source in sources:
                pending.append(pool.submit(_copy_one, source))
            # result() re-raises any exception raised inside a worker.
            for finished in concurrent.futures.as_completed(pending):
                finished.result()