예제 #1
0
 def tqdm(self):
     """Show a progress bar for the duration of the current extraction.

     Opens a ``utils.async_tqdm`` bar (initial total of 0, unit ``' file'``),
     stores it on ``self._pbar_path``, and yields exactly once while the bar
     is still open. The bar is closed when the generator is resumed and the
     ``with`` block exits.

     NOTE(review): presumably wrapped with ``contextlib.contextmanager`` by a
     decorator above this definition -- confirm against the full file.

     Yields:
         None. Callers interact with the bar through ``self._pbar_path``.
     """
     progress_bar_cm = utils.async_tqdm(
         total=0,
         desc='Extraction completed...',
         unit=' file',
     )
     with progress_bar_cm as extraction_bar:
         # Publish the live bar on the instance before handing control back.
         self._pbar_path = extraction_bar
         yield
예제 #2
0
def download_gcs_dataset(dataset_name,
                         local_dataset_dir,
                         max_simultaneous_downloads=50):
    """Copy the files of a prepared GCS dataset into a local directory.

    Lists every path under ``GCS_DATASETS_DIR/<dataset_name>``, drops the
    ones whose path starts with the dataset's ``diffs`` prefix, and downloads
    the rest concurrently on a thread pool while a progress bar tracks the
    number of completed files.

    Args:
        dataset_name: Name of the dataset; joined onto ``GCS_DATASETS_DIR``
            to form the remote prefix.
        local_dataset_dir: Destination directory. Each file is written as
            ``<local_dataset_dir>/<basename of the remote path>``.
        max_simultaneous_downloads: Maximum number of worker threads used
            for the downloads.

    Raises:
        Any exception raised by an individual download is re-raised here
        when that download's future is collected.
    """
    dataset_prefix = posixpath.join(GCS_DATASETS_DIR, dataset_name)
    diffs_prefix = posixpath.join(dataset_prefix, "diffs")

    # Keep everything except entries that live under the diffs folder.
    remote_paths = []
    for remote_path in gcs_files(dataset_prefix):
        if remote_path.startswith(diffs_prefix):
            continue
        remote_paths.append(remote_path)

    progress_cm = utils.async_tqdm(total=len(remote_paths),
                                   desc="Dl Completed...",
                                   unit=" file")
    with progress_cm as progress:

        def _fetch_one(remote_path):
            # Files are flattened: only the basename is kept locally.
            file_name = posixpath.basename(remote_path)
            destination = posixpath.join(local_dataset_dir, file_name)
            download_gcs_file(remote_path, destination)
            progress.update(1)

        pool = concurrent.futures.ThreadPoolExecutor(
            max_workers=max_simultaneous_downloads)
        with pool as executor:
            pending = []
            for remote_path in remote_paths:
                pending.append(executor.submit(_fetch_one, remote_path))

            # Collect each result so that a failed download surfaces as an
            # exception instead of being silently dropped.
            for finished in concurrent.futures.as_completed(pending):
                finished.result()