def save(xx, location, product_name, verbose):
    """
    Render the first time-slice of ``xx`` to RGBA and write it out as a
    Cloud-Optimized GeoTIFF at ``{location}/{product_name}.tif``.

    :param xx: xarray object with a ``time`` dimension (first slice is used)
    :param location: output prefix (local path or ``s3://...``)
    :param product_name: basename for the output ``.tif``
    :param verbose: when True, print the destination URI before writing
    """
    # Single-worker, threads-only local cluster (no nanny, no extra processes).
    client = start_local_dask(
        nanny=False,
        n_workers=1,
        threads_per_worker=8,
        mem_safety_margin="0G",
        processes=False,
    )

    # 8 GiB GDAL block cache. Apply the S3/GDAL configuration both in this
    # process (client=None) and on the Dask workers.
    gdal_opts = {"GDAL_CACHEMAX": 8 * (1 << 30)}
    for c in (None, client):
        configure_s3_access(aws_unsigned=True, cloud_defaults=True, client=c, **gdal_opts)

    # First time-slice -> clamped RGBA, materialized into local memory.
    rgba = xr_to_mem(to_rgba(xx.isel(time=0), clamp=(0, 3000)), client)

    uri = f"{location}/{product_name}.tif"
    if verbose:
        print(f"Writing {uri}")

    write_cog(
        rgba,
        uri,
        blocksize=1024,
        compress="zstd",
        zstd_level=4,
        overview_levels=[],
        NUM_THREADS="ALL_CPUS",
        BIGTIFF="YES",
        SPARSE_OK=True,
    )
def _init_dask(self) -> Client:
    """
    Start a local (in-process) Dask cluster from ``self._cfg`` settings and
    configure S3/GDAL access on both the client process and the workers.

    Returns the started :class:`Client`.
    """
    cfg = self._cfg

    # Thread count: non-positive config value means "use all available CPUs".
    nthreads = cfg.threads if cfg.threads > 0 else get_max_cpu()

    # Memory limit: empty string means "auto-detect", leaving ~1G spare
    # when the machine has more than 2G total.
    memory_limit: Union[str, int] = cfg.memory_limit
    if memory_limit == "":
        one_gig = 1 << 30
        memory_limit = get_max_mem()
        if memory_limit > 2 * one_gig:
            memory_limit -= one_gig

    client = start_local_dask(
        threads_per_worker=nthreads, processes=False, memory_limit=memory_limit
    )

    # Configure S3 access locally (client=None) and on the dask workers.
    for c in (None, client):
        configure_s3_access(
            aws_unsigned=self._cfg.aws_unsigned, cloud_defaults=True, client=c
        )

    self._log.info(f"Started local Dask {client}")
    return client
def create_local_dask_cluster(spare_mem='3Gb', display_client=True):
    """
    Using the datacube utils function `start_local_dask`, generate
    a local dask cluster. Automatically detects if on AWS or NCI.

    Example use :

        import sys
        sys.path.append("../Scripts")
        from dea_dask import create_local_dask_cluster

        create_local_dask_cluster(spare_mem='4Gb')

    Parameters
    ----------
    spare_mem : String, optional
        The amount of memory, in Gb, to leave for the notebook to run.
        This memory will not be used by the cluster. e.g '3Gb'
    display_client : Bool, optional
        An optional boolean indicating whether to display a summary of
        the dask client, including a link to monitor progress of the
        analysis. Set to False to hide this display.

    Returns
    -------
    client : dask.distributed.Client
        The started local client.
    """
    # NOTE(review): the previous version attempted to close an earlier client
    # via `locals().get('client', None)`, but a fresh call frame never has
    # that local, so the branch was dead code and has been removed.
    on_aws = 'AWS_ACCESS_KEY_ID' in os.environ

    if on_aws:
        # Configure dashboard link to go over the JupyterHub proxy.
        dask.config.set({
            "distributed.dashboard.link":
                os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/') + "proxy/{port}/status"
        })

    # Start up a local cluster (same call on AWS and NCI).
    client = start_local_dask(mem_safety_margin=spare_mem)

    if on_aws:
        # Configure GDAL for S3 access on the client and its workers.
        configure_s3_access(aws_unsigned=True, client=client)

    # Show the dask cluster settings.
    if display_client:
        display(client)

    return client
def create_local_dask_cluster(spare_mem='3Gb', aws_unsigned=True, display_client=True, **kwargs):
    """
    Generate a local dask cluster via the datacube utility
    ``start_local_dask`` and configure GDAL/S3 access for it.

    Example use :

        import sys
        sys.path.append("../Scripts")
        from deafrica_dask import create_local_dask_cluster

        create_local_dask_cluster(spare_mem='4Gb')

    Parameters
    ----------
    spare_mem : String, optional
        The amount of memory, in Gb, to leave for the notebook to run.
        This memory will not be used by the cluster. e.g '3Gb'
    aws_unsigned : Bool, optional
        Whether S3 access is unsigned. Set to True for publicly available
        datasets, False when credentials are required (e.g. Landsat C2
        provisional data needs aws_unsigned=False); credentials are passed
        to processing threads, local or on the dask cluster.
    display_client : Bool, optional
        Whether to display a summary of the dask client, including a link
        to monitor analysis progress. Set to False to hide this display.
    **kwargs:
        Additional keyword arguments passed to start_local_dask(),
        e.g. n_workers can be set to be greater than 1.
    """
    # Route the dashboard link through the JupyterHub proxy.
    proxy_prefix = os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/')
    dask.config.set({"distributed.dashboard.link": proxy_prefix + "proxy/{port}/status"})

    # Start up a local cluster.
    client = start_local_dask(mem_safety_margin=spare_mem, **kwargs)

    # Configure GDAL for S3 access.
    configure_s3_access(aws_unsigned=aws_unsigned, client=client)

    # Show the dask cluster settings.
    if display_client:
        display(client)
def __init__(self):
    """
    Prepare the Celery worker environment: put the conda env on PATH and
    enable requester-pays S3 access when AWS credentials are present.
    """
    # Set some Celery env vars.
    # Tried (1) sourcing `/var/www/[.profile|.bashrc]` in `/etc/init.d/data_cube_ui`,
    # (2) writing to `/etc/default/[data_cube_ui|celerybeat]`, and
    # (3) writing a Python file to define Celery environment variables to `exec` in Celery code.
    # All were unsuccessful.
    # TODO: This should run on worker init, not when running a task.
    current_path = os.environ['PATH']
    if '/miniconda/envs/odc/bin' not in current_path:
        os.environ['PATH'] = '/miniconda/envs/odc/bin:' + current_path

    # Configure ODC to load from requester-pays S3 buckets, but only when
    # both halves of the credential pair are available.
    have_key = os.environ.get('AWS_ACCESS_KEY_ID') is not None
    have_secret = os.environ.get('AWS_SECRET_ACCESS_KEY') is not None
    if have_key and have_secret:
        configure_s3_access(requester_pays=True)
def test_rio_configure_aws_access(monkeypatch, without_aws_env, dask_client):
    """
    Verify configure_s3_access() picks up fake AWS credentials from the
    environment, both in-process and when propagated to Dask workers.
    """
    fake_env = {
        "AWS_ACCESS_KEY_ID": "fake-key-id",
        "AWS_SECRET_ACCESS_KEY": "fake-secret",
        "AWS_DEFAULT_REGION": "fake-region",
    }
    for name, value in fake_env.items():
        monkeypatch.setenv(name, value)

    def check_creds(creds):
        # Frozen credentials must reflect the fake env, with no session token.
        frozen = creds.get_frozen_credentials()
        assert frozen.access_key == 'fake-key-id'
        assert frozen.secret_key == 'fake-secret'
        assert frozen.token is None

    # Local (in-process) configuration.
    check_creds(configure_s3_access())

    env = activate_from_config()
    assert env is not None
    for key in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_REGION'):
        assert key in env
    assert 'AWS_SESSION_TOKEN' not in env

    env = get_rio_env(sanitize=False)
    assert env is not None
    assert env['AWS_ACCESS_KEY_ID'] == 'fake-key-id'
    assert env['AWS_SECRET_ACCESS_KEY'] == 'fake-secret'
    assert env['AWS_REGION'] == 'fake-region'
    assert env['GDAL_DISABLE_READDIR_ON_OPEN'] == 'EMPTY_DIR'
    ee_local = env

    # Same checks, but configured through a Dask client onto its workers.
    client = dask_client
    check_creds(configure_s3_access(client=client))

    env = client.submit(activate_from_config).result()
    assert env is not None
    for key in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_REGION'):
        assert key in env
    assert 'AWS_SESSION_TOKEN' not in env

    def _activate_and_get(sanitize=True):
        activate_from_config()
        return get_rio_env(sanitize=sanitize)

    # Worker-side rio env must match the local one exactly.
    env = client.submit(_activate_and_get, sanitize=False).result()
    assert env == ee_local
def query_handler(self, time, feature, dask_client=None, parameters=None):
    """
    Load data for (time, feature), process it into a dataframe, render a
    chart, and return both as ``{"data": df, "chart": chart}``.

    A Dask client is created if none is supplied; either way the client is
    closed when the work is done (``with dask_client`` block).
    """
    parameters = {} if parameters is None else parameters

    if dask_client is None:
        dask_client = Client(
            n_workers=num_workers(), processes=True, threads_per_worker=1
        )

    with dask_client:
        # Unsigned S3 access, region from env (defaults to "auto").
        configure_s3_access(
            aws_unsigned=True,
            region_name=os.getenv("AWS_DEFAULT_REGION", "auto"),
            client=dask_client,
        )
        with datacube.Datacube() as dc:
            data = self.input_data(dc, time, feature)
        proc_params = {"time": time, "feature": feature, **parameters}
        df = self.process_data(data, proc_params)
        chart = self.render_chart(df)

    return {"data": df, "chart": chart}
def __init__(
    self,
    config: Optional[FeaturePathConfig] = None,
    geobox_dict: Optional[Dict] = None,
    client: Optional[Client] = None,
):
    """
    Initialise with an optional config, geobox dictionary and Dask client.
    When no client is supplied, a local threads-only cluster is started
    (90% of detected memory) and configured for unsigned S3 access.
    """
    self.config = config if config else FeaturePathConfig()
    self.geobox_dict = geobox_dict

    if not client:
        # Local cluster sized from the machine: all CPUs as threads,
        # memory capped at 90% of what is available.
        client = start_local_dask(
            threads_per_worker=get_max_cpu(),
            processes=False,
            memory_limit=int(0.9 * get_max_mem()),
        )
        configure_s3_access(aws_unsigned=True, cloud_defaults=True, client=client)

    self.client = client

    setup_logging()
    self._log = logging.getLogger(__name__)
from IPython.display import display, Image
import matplotlib.pyplot as plt

plt.rcParams["axes.facecolor"] = "magenta"  # makes transparent pixels obvious
import numpy as np
import xarray as xr

# %%
from dask.distributed import Client, wait as dask_wait
from datacube.utils.dask import start_local_dask
from datacube.utils.rio import configure_s3_access

# Manual toggle: flip to True to start a fresh local cluster on port 11311
# instead of attaching to an already-running scheduler at that address.
if False:
    client = start_local_dask(scheduler_port=11311, threads_per_worker=16)
    configure_s3_access(aws_unsigned=True, cloud_defaults=True, client=client)
else:
    client = Client("tcp://127.0.0.1:11311")
    client.restart()

# Bare expression: displays the client summary in the notebook.
client

# %%
from odc.algo import to_rgba
from odc.ui import to_jpeg_data


def mk_roi(y, x, sz=256):
    # 2D slice selecting an sz-by-sz window whose top-left corner is (y, x).
    return np.s_[y : y + sz, x : x + sz]
def run_gm(cache_file, tasks, dryrun, verbose, threads, x_chunks, y_chunks, overwrite, public, location):
    """
    Run GM stats (drives the ``._gm`` pipeline; the original docstring's
    "Pixel Quality" heading looked like a copy-paste from ``run_pq``).

    Task could be one of the 3 things

    \b
    1. Comma-separated triplet: period,x,y or 'x[+-]<int>/y[+-]<int>/period
       2019--P1Y,+003,-004
       2019--P1Y/3/-4 `/` is also accepted
       x+003/y-004/2019--P1Y is accepted as well

    2. A zero based index
    3. A slice following python convention <start>:<stop>[:<step]
        ::10 -- every tenth task: 0,10,20,..
       1::10 -- every tenth but skip first one 1, 11, 21 ..
        :100 -- first 100 tasks

    If no tasks are supplied the whole file will be processed.
    """
    from tqdm.auto import tqdm
    from functools import partial
    import dask
    import psutil

    from .io import S3COGSink
    from ._gm import gm_input_data, gm_reduce, gm_product
    from .proc import process_tasks
    from .tasks import TaskReader
    from datacube.utils.dask import start_local_dask
    from datacube.utils.rio import configure_s3_access

    # Turn off distributed's automatic worker memory management
    # (target/spill/pause/terminate thresholds all disabled).
    dask.config.set({'distributed.worker.memory.target': False})
    dask.config.set({'distributed.worker.memory.spill': False})
    dask.config.set({'distributed.worker.memory.pause': False})
    dask.config.set({'distributed.worker.memory.terminate': False})

    # config
    resampling = 'bilinear'
    COG_OPTS = dict(
        compress='deflate',
        predict=2,
        zlevel=6,
        blocksize=800,
        ovr_blocksize=256,  # ovr_blocksize must be powers of 2 for some reason in GDAL
        overview_resampling='bilinear')
    ncpus = psutil.cpu_count()
    # ..

    # threads <= 0 means: use every available CPU per worker.
    if threads <= 0:
        threads = ncpus

    rdr = TaskReader(cache_file)
    product = gm_product(location=location)

    if verbose:
        print(repr(rdr))

    def _proc(task):
        # Assemble the (lazy) input for one task, re-chunk it, and apply
        # the gm reduction. Chunking: y split into y_chunks row bands for
        # load, then time collapsed and x split into x_chunks for reduce.
        NY, NX = task.geobox.shape
        ds_in = gm_input_data(task, resampling=resampling, chunk=(NY // y_chunks, NX))
        tdim = list(ds_in.dims)[0]
        ds_in = ds_in.chunk({tdim: -1, 'x': NX // x_chunks})
        ds = gm_reduce(ds_in,
                       num_threads=ncpus // x_chunks + 2,
                       wk_rows=(NY // y_chunks) // 4,
                       as_array=True)
        return ds

    def dry_run_proc(task, sink, check_s3=False):
        # Print a one-line summary for a task without computing anything;
        # optionally checks S3 for an existing output. Returns the URI.
        uri = sink.uri(task)
        exists = None
        if check_s3:
            exists = sink.exists(task)
        nds = len(task.datasets)
        ndays = len(set(ds.center_time.date() for ds in task.datasets))
        if overwrite:
            flag = {None: '', True: ' (recompute)', False: ' (new)'}[exists]
        else:
            flag = {None: '', True: ' (skip)', False: ' (new)'}[exists]
        task_id = f"{task.short_time}/{task.tile_index[0]:+05d}/{task.tile_index[1]:+05d}"
        print(f"{task_id} days={ndays:03} ds={nds:04} {uri}{flag}")
        return uri

    # No task selectors supplied -> process every tile in the cache file.
    if len(tasks) == 0:
        tasks = rdr.all_tiles
        if verbose:
            print(f"Found {len(tasks):,d} tasks in the file")
    else:
        try:
            tasks = parse_all_tasks(tasks, rdr.all_tiles)
        except ValueError as e:
            print(str(e), file=sys.stderr)
            sys.exit(1)

        if verbose:
            print(f"Will process {len(tasks):,d} tasks")

    sink = S3COGSink(cog_opts=COG_OPTS, public=public)

    # Writing to S3 requires working credentials up-front.
    if product.location.startswith('s3:'):
        if not sink.verify_s3_credentials():
            print("Failed to load S3 credentials")
            sys.exit(2)

        if verbose and sink._creds:
            creds_rw = sink._creds
            print(f'creds: ..{creds_rw.access_key[-5:]} ..{creds_rw.secret_key[-5:]}')

    _tasks = rdr.stream(tasks, product)

    client = None
    if not dryrun:
        if verbose:
            print("Starting local Dask cluster")

        client = start_local_dask(threads_per_worker=threads,
                                  mem_safety_margin='1G')
        # TODO: aws_unsigned is not always desirable
        configure_s3_access(aws_unsigned=True,
                            cloud_defaults=True,
                            client=client)
        if verbose:
            print(client)

    if dryrun:
        results = map(partial(dry_run_proc, sink=sink, check_s3=not overwrite), _tasks)
    else:
        results = process_tasks(_tasks, _proc, client, sink,
                                check_exists=not overwrite,
                                chunked_persist=x_chunks,
                                verbose=verbose)

    if not dryrun and verbose:
        results = tqdm(results, total=len(tasks))

    for p in results:
        if verbose and not dryrun:
            print(p)

    if verbose:
        print("Exiting")

    if client is not None:
        client.close()
def create_local_dask_cluster(spare_mem='3Gb',
                              aws_unsigned=True,
                              display_client=True,
                              start_local_dask_kwargs=None,
                              configure_s3_access_kwargs=None):
    """
    Start a local dask cluster sized from the machine's CPU topology and
    configure it for S3 access.

    Credit belongs to Digital Earth Africa:
    https://github.com/digitalearthafrica/deafrica-sandbox-notebooks/blob/master/Scripts/deafrica_dask.py

    Example use :

        import sys
        sys.path.append("../Scripts")
        from deafrica_dask import create_local_dask_cluster

        create_local_dask_cluster(spare_mem='4Gb')

    Parameters
    ----------
    spare_mem : String, optional
        The amount of memory, in Gb, to leave for the notebook to run.
        This memory will not be used by the cluster. e.g '3Gb'
    aws_unsigned : Bool, optional
        Whether S3 access is unsigned. True for publicly available
        datasets, False for private data (e.g. Landsat C2 provisional
        requires aws_unsigned=False); credentials are passed on to
        processing threads, local or on the dask cluster.
    display_client : Bool, optional
        Whether to display a summary of the dask client, including a
        link to monitor analysis progress. Set to False to hide it.
    start_local_dask_kwargs : dict, optional
        Keyword arguments for `datacube.utils.dask.start_local_dask`
        (e.g. number of workers, threads per worker, memory limit).
        Missing 'n_workers'/'threads_per_worker' entries are filled in
        (note: the dict is mutated in place via setdefault).
    configure_s3_access_kwargs : dict, optional
        Keyword arguments for `datacube.utils.rio.configure_s3_access`.

    Returns
    -------
    client : dask.distributed.Client
        The started local client.
    """
    if start_local_dask_kwargs is None:
        start_local_dask_kwargs = {}
    if configure_s3_access_kwargs is None:
        configure_s3_access_kwargs = {}

    # Route the dashboard link through the JupyterHub proxy.
    proxy_prefix = os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/')
    dask.config.set({
        "distributed.dashboard.link": proxy_prefix + "proxy/{port}/status"
    })

    # Size defaults from CPU topology: one worker per physical core minus one.
    n_physical = psutil.cpu_count(logical=False)
    n_logical = psutil.cpu_count(logical=True)
    logical_per_physical = n_logical / n_physical

    start_local_dask_kwargs.setdefault('n_workers', n_physical - 1)
    # NOTE(review): this default scales threads_per_worker with n_workers,
    # which can oversubscribe logical CPUs when n_workers > 1 — confirm
    # this is intended rather than e.g. n_logical // n_workers.
    start_local_dask_kwargs.setdefault(
        'threads_per_worker',
        int(logical_per_physical * start_local_dask_kwargs['n_workers']))

    client = start_local_dask(mem_safety_margin=spare_mem, **start_local_dask_kwargs)

    # Configure GDAL for s3 access.
    configure_s3_access(aws_unsigned=aws_unsigned,
                        client=client,
                        **configure_s3_access_kwargs)

    return client
def run_pq(cache_file, tasks, dryrun, verbose, threads, overwrite, public, location):
    """
    Run Pixel Quality stats

    Task could be one of the 3 things

    \b
    1. Comma-separated triplet: period,x,y or 'x[+-]<int>/y[+-]<int>/period
       2019--P1Y,+003,-004
       2019--P1Y/3/-4 `/` is also accepted
       x+003/y-004/2019--P1Y is accepted as well

    2. A zero based index
    3. A slice following python convention <start>:<stop>[:<step]
        ::10 -- every tenth task: 0,10,20,..
       1::10 -- every tenth but skip first one 1, 11, 21 ..
        :100 -- first 100 tasks

    If no tasks are supplied the whole file will be processed.
    """
    from tqdm.auto import tqdm
    from functools import partial

    from .io import S3COGSink
    from ._pq import pq_input_data, pq_reduce, pq_product
    from .proc import process_tasks
    from .tasks import TaskReader
    from datacube.utils.dask import start_local_dask
    from datacube.utils.rio import configure_s3_access

    # config
    resampling = 'nearest'
    COG_OPTS = dict(compress='deflate',
                    predict=2,
                    zlevel=6,
                    blocksize=800)
    # ..

    rdr = TaskReader(cache_file)
    product = pq_product(location=location)

    if verbose:
        print(repr(rdr))

    def pq_proc(task):
        # Assemble the (lazy) input for one task and apply the pq reduction.
        ds_in = pq_input_data(task, resampling=resampling)
        ds = pq_reduce(ds_in)
        return ds

    def dry_run_proc(task, sink, check_s3=False):
        # Print a one-line summary for a task without computing anything;
        # optionally checks S3 for an existing output. Returns the URI.
        uri = sink.uri(task)
        exists = None
        if check_s3:
            exists = sink.exists(task)
        nds = len(task.datasets)
        ndays = len(set(ds.center_time.date() for ds in task.datasets))
        if overwrite:
            flag = {None: '', True: ' (recompute)', False: ' (new)'}[exists]
        else:
            flag = {None: '', True: ' (skip)', False: ' (new)'}[exists]
        task_id = f"{task.short_time}/{task.tile_index[0]:+05d}/{task.tile_index[1]:+05d}"
        print(f"{task_id} days={ndays:03} ds={nds:04} {uri}{flag}")
        return uri

    # No task selectors supplied -> process every tile in the cache file.
    if len(tasks) == 0:
        tasks = rdr.all_tiles
        if verbose:
            print(f"Found {len(tasks):,d} tasks in the file")
    else:
        try:
            tasks = parse_all_tasks(tasks, rdr.all_tiles)
        except ValueError as e:
            print(str(e), file=sys.stderr)
            sys.exit(1)

        if verbose:
            print(f"Will process {len(tasks):,d} tasks")

    sink = S3COGSink(cog_opts=COG_OPTS, public=public)

    # Writing to S3 requires working credentials up-front.
    if product.location.startswith('s3:'):
        if not sink.verify_s3_credentials():
            print("Failed to load S3 credentials")
            sys.exit(2)

        if verbose and sink._creds:
            creds_rw = sink._creds
            print(f'creds: ..{creds_rw.access_key[-5:]} ..{creds_rw.secret_key[-5:]}')

    _tasks = rdr.stream(tasks, product)

    client = None
    if not dryrun:
        if verbose:
            print("Starting local Dask cluster")

        client = start_local_dask(threads_per_worker=threads,
                                  mem_safety_margin='1G')
        # TODO: aws_unsigned is not always desirable
        configure_s3_access(aws_unsigned=True,
                            cloud_defaults=True,
                            client=client)
        if verbose:
            print(client)

    if dryrun:
        results = map(partial(dry_run_proc, sink=sink, check_s3=not overwrite), _tasks)
    else:
        results = process_tasks(_tasks, pq_proc, client, sink,
                                check_exists=not overwrite,
                                verbose=verbose)

    if not dryrun and verbose:
        results = tqdm(results, total=len(tasks))

    for p in results:
        if verbose and not dryrun:
            print(p)

    if verbose:
        print("Exiting")

    if client is not None:
        client.close()