def flow(self):
    with Flow(self.name, storage=self.storage, environment=self.environment) as _flow:
        # download the source files to the cache
        nc_sources = [
            download(x, cache_location=self.cache_location)
            for x in self.sources
        ]

        # write the cached files to the target in chunks; only the first
        # chunk initializes the store
        first = True
        write_tasks = []
        for source_group in chunked_iterable(nc_sources, self.files_per_chunk):
            write_task = combine_and_write(
                source_group, self.target_location, self.concat_dim, first=first
            )
            write_tasks.append(write_task)
            first = False

        consolidate_metadata(self.target_location)

    return _flow
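The flow leans on a `chunked_iterable` helper that is not shown here. A minimal sketch, assuming the standard itertools recipe that yields tuples of at most `size` items:

import itertools


def chunked_iterable(iterable, size):
    # yield successive tuples of at most `size` items from `iterable`;
    # the final tuple may be shorter
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, size))
        if not chunk:
            break
        yield chunk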
def main():
    # query NASA's Common Metadata Repository (CMR) for matching granule URLs
    url_list = cmr_search(
        short_name,
        version,
        time_start,
        time_end,
        bounding_box=bounding_box,
        polygon=polygon,
        filename_filter=filename_filter,
    )
    credentials = get_credentials(url_list[0])

    # download the granules to a cache bucket, 100 URLs per task
    tasks = []
    for url_group in chunked_iterable(url_list, 100):
        tasks.append(
            cmr_download(
                url_group,
                "gs://carbonplan-scratch/glas-cache/",
                credentials=credentials,
            )
        )
    dask.compute(tasks, retries=3, scheduler="single-threaded")
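For `dask.compute(tasks, retries=3, ...)` to schedule these downloads, `cmr_download` must return a lazy object rather than downloading eagerly. A minimal sketch, assuming it is wrapped with `dask.delayed`; the body and the `fetch_to_cache` helper are hypothetical, since the real implementation is not shown:

import dask


@dask.delayed
def cmr_download(urls, cache_location, credentials=None):
    # hypothetical body: fetch each URL into the cache bucket and
    # return the cached paths
    return [fetch_to_cache(url, cache_location, credentials) for url in urls]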
import fsspec
import pandas as pd
import zarr
from prefect import Flow, task

# download, combine_and_write, and chunked_iterable are defined elsewhere
# in the recipe


@task
def consolidate_metadata(target):
    mapper = fsspec.get_mapper(target)
    zarr.consolidate_metadata(mapper)


# one key per day of source data
keys = pd.date_range("1981-09-01", "1981-09-10", freq="D")
target_path = "gs://pangeo-scratch/rabernat/pangeo_smithy/oisst-avhrr-v02r01-target/"
concat_dim = "time"
files_per_chunk = 5

with Flow("Pangeo-Forge") as flow:
    sources = [download(k) for k in keys]

    first = True
    write_tasks = []
    for source_group in chunked_iterable(sources, files_per_chunk):
        write_task = combine_and_write(source_group, target_path, concat_dim, first=first)
        write_tasks.append(write_task)
        first = False
    cm = consolidate_metadata(target_path)

# create dependencies in imperative mode
for n in range(1, len(write_tasks)):
    write_tasks[n].set_upstream(write_tasks[n - 1], flow=flow)
cm.set_upstream(write_tasks[-1], flow=flow)

# DO IT!
# flow.run()
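The commented-out `flow.run()` would use Prefect's default serial executor; a Dask executor would run independent tasks in parallel. A sketch, assuming the Prefect 0.x API used above (the `DaskExecutor` import path varies across 0.x releases):

from prefect.executors import DaskExecutor

flow.run(executor=DaskExecutor())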
def chunk(sources, size):
    # TODO: move to pangeo_forge.
    return list(chunked_iterable(sources, size))
def test_chunked_iterable(iterable, size, expected):
    actual = list(chunked_iterable(iterable, size))
    assert actual == expected
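The parametrize decorator for this test is not shown. A plausible parametrization, assuming `chunked_iterable` yields tuples as in the itertools recipe sketched earlier; the case values are illustrative, not taken from the original test suite:

import pytest


@pytest.mark.parametrize(
    "iterable, size, expected",
    [
        (list(range(5)), 2, [(0, 1), (2, 3), (4,)]),
        (list(range(4)), 2, [(0, 1), (2, 3)]),
        ([], 3, []),
    ],
)
def test_chunked_iterable(iterable, size, expected):
    actual = list(chunked_iterable(iterable, size))
    assert actual == expected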