def flow(self):
    if len(self.targets) == 1:
        target = self.targets[0]
    else:
        raise ValueError("Zarr target requires self.targets be a length one list")

    with Flow(
        self.name, storage=self.storage, environment=self.environment
    ) as _flow:
        # download to cache
        nc_sources = download.map(
            self.sources,
            cache_location=unmapped(self.cache_location),
        )

        # convert cached netcdf data to zarr
        cached_sources = nc2zarr.map(
            nc_sources,
            cache_location=unmapped(self.cache_location),
        )

        # combine all datasets into a single zarr archive
        combine_and_write(cached_sources, target)

    return _flow
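The `nc2zarr` task mapped above isn't shown. A minimal sketch of what such a conversion step could look like, assuming it receives one cached NetCDF path and rewrites it as a zarr store inside the cache; the naming scheme and the xarray round-trip here are assumptions, not the pipeline's actual implementation:

import os

import xarray as xr
from prefect import task


@task
def nc2zarr(source, cache_location):
    # Hypothetical sketch: place the zarr copy next to the cached NetCDF
    # file, keyed by the source filename. A real task would likely use
    # fsspec to handle remote cache locations.
    target = os.path.join(cache_location, os.path.basename(source) + ".zarr")
    ds = xr.open_dataset(source)
    ds.to_zarr(target, mode="w")
    return target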
def flow(self):
    with Flow(self.name) as _flow:
        sources = source_url.map(self.days)
        nc_sources = download.map(
            sources, cache_location=unmapped(self.cache_location)
        )
        chunked = chunk(nc_sources, size=self.files_per_chunk)
        writes = combine_and_write.map(
            chunked,
            unmapped(self.target_location),
            unmapped(self.concat_dim),
        )
        consolidate_metadata(writes, self.target_location)
    return _flow
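`source_url` is mapped over `self.days` but not defined here. A hypothetical sketch of such a task, with a placeholder host and path pattern standing in for the real data source:

import pandas as pd
from prefect import task


@task
def source_url(day) -> str:
    # Build the URL for one day's NetCDF file. The domain and layout
    # below are placeholders, not the pipeline's actual data source.
    day = pd.Timestamp(day)
    return f"https://example.com/data/{day:%Y/%m/%d}.nc"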
def flow(self):
    with Flow(self.name) as flow:
        # Map the `source_url` task to each day. This returns a mapped output,
        # a list of string URLs. See
        # https://docs.prefect.io/core/concepts/mapping.html#prefect-approach
        # for more. We'll have one output URL per day.
        sources = source_url.map(self.days)

        # Map the `download` task to download the raw data into a cache.
        # Mapped outputs (sources) can be fed straight into another Task.map
        # call. If an input is just a regular argument that's not a mapping,
        # it must be wrapped in `prefect.unmapped`.
        # https://docs.prefect.io/core/concepts/mapping.html#unmapped-inputs
        # nc_sources will be a list of cached URLs, one per input day.
        nc_sources = download.map(
            sources, cache_location=unmapped(self.cache_location)
        )

        # The individual files would be a bit too small for analysis. We'll
        # use pangeo_forge.utils.chunk to batch them up. We can pass mapped
        # outputs like nc_sources directly to `chunk`.
        chunked = pangeo_forge.utils.chunk(nc_sources, size=5)

        # Combine all the chunked inputs and write them to their final
        # destination.
        writes = combine_and_write.map(
            chunked,
            unmapped(self.target_location),
            append_dim=unmapped("time"),
            concat_dim=unmapped("time"),
        )

        # Consolidate the metadata for the final dataset.
        consolidate_metadata(self.target_location, writes=writes)

    return flow
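To make the write step concrete, here is a hedged sketch of what a `combine_and_write` task with this call signature might do: open one chunk of cached files, concatenate them along `concat_dim`, and append to the zarr target along `append_dim`. The xarray calls are real API, but the body is an illustration, not the actual task:

import xarray as xr
from prefect import task


@task
def combine_and_write(sources, target, append_dim, concat_dim):
    # Open each cached file in the chunk and concatenate into one dataset.
    dsets = [xr.open_dataset(s) for s in sources]
    ds = xr.concat(dsets, dim=concat_dim)
    # Append to the existing zarr store along `append_dim`; the first
    # batch would instead initialize the store with mode="w".
    ds.to_zarr(target, mode="a", append_dim=append_dim)
    return target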
def flow(self):
    with Flow(
        self.name, storage=self.storage, environment=self.environment
    ) as _flow:
        # download to cache
        nc_sources = download.map(
            self.sources,
            cache_location=unmapped(self.cache_location),
        )

        # append each batch of cached files to the target in turn; the
        # first batch initializes the store
        first = True
        write_tasks = []
        for source_group in chunked_iterable(nc_sources, self.files_per_chunk):
            write_task = combine_and_write(
                source_group, self.target_location, self.concat_dim, first=first
            )
            write_tasks.append(write_task)
            first = False

        # consolidate the metadata for the final dataset
        cm = consolidate_metadata(self.target_location)
    return _flow
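`chunked_iterable` batches a sequence into fixed-size groups; `pangeo_forge.utils` provides a helper along these lines, and a self-contained equivalent makes the behavior explicit:

import itertools


def chunked_iterable(iterable, size):
    # Yield successive tuples of up to `size` items; the last tuple may
    # be shorter if the input doesn't divide evenly.
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, size))
        if not chunk:
            break
        yield chunk


# Example: list(chunked_iterable(range(5), 2)) -> [(0, 1), (2, 3), (4,)]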