def save(xx, location, product_name, verbose):
    """
    Render the first time-slice of ``xx`` to RGBA and write it out as a
    Cloud-Optimized GeoTIFF at ``{location}/{product_name}.tif``.

    :param xx: xarray object with a ``time`` dimension (first slice is used)
    :param location: output prefix (local path or ``s3://...``)
    :param product_name: basename for the output ``.tif``
    :param verbose: when True, print the destination URI before writing
    """
    # Single-worker, threads-only local cluster (no nanny, no extra processes).
    client = start_local_dask(
        nanny=False,
        n_workers=1,
        threads_per_worker=8,
        mem_safety_margin="0G",
        processes=False,
    )

    # 8 GiB GDAL block cache. Apply the S3/GDAL configuration both in this
    # process (client=None) and on the Dask workers.
    gdal_opts = {"GDAL_CACHEMAX": 8 * (1 << 30)}
    for c in (None, client):
        configure_s3_access(aws_unsigned=True, cloud_defaults=True, client=c, **gdal_opts)

    # First time-slice -> clamped RGBA, materialized into local memory.
    rgba = xr_to_mem(to_rgba(xx.isel(time=0), clamp=(0, 3000)), client)

    uri = f"{location}/{product_name}.tif"
    if verbose:
        print(f"Writing {uri}")

    write_cog(
        rgba,
        uri,
        blocksize=1024,
        compress="zstd",
        zstd_level=4,
        overview_levels=[],
        NUM_THREADS="ALL_CPUS",
        BIGTIFF="YES",
        SPARSE_OK=True,
    )
def _init_dask(self) -> Client:
    """
    Start a local (in-process) Dask cluster from ``self._cfg`` settings and
    configure S3/GDAL access on both the client process and the workers.

    Returns the started :class:`Client`.
    """
    cfg = self._cfg

    # Thread count: non-positive config value means "use all available CPUs".
    nthreads = cfg.threads if cfg.threads > 0 else get_max_cpu()

    # Memory limit: empty string means "auto-detect", leaving ~1G spare
    # when the machine has more than 2G total.
    memory_limit: Union[str, int] = cfg.memory_limit
    if memory_limit == "":
        one_gig = 1 << 30
        memory_limit = get_max_mem()
        if memory_limit > 2 * one_gig:
            memory_limit -= one_gig

    client = start_local_dask(
        threads_per_worker=nthreads, processes=False, memory_limit=memory_limit
    )

    # Configure S3 access locally (client=None) and on the dask workers.
    for c in (None, client):
        configure_s3_access(
            aws_unsigned=self._cfg.aws_unsigned, cloud_defaults=True, client=c
        )

    self._log.info(f"Started local Dask {client}")
    return client
def create_local_dask_cluster(spare_mem='3Gb', display_client=True):
    """
    Using the datacube utils function `start_local_dask`, generate
    a local dask cluster. Automatically detects if on AWS or NCI.

    Example use :

        import sys
        sys.path.append("../Scripts")
        from dea_dask import create_local_dask_cluster

        create_local_dask_cluster(spare_mem='4Gb')

    Parameters
    ----------
    spare_mem : String, optional
        The amount of memory, in Gb, to leave for the notebook to run.
        This memory will not be used by the cluster. e.g '3Gb'
    display_client : Bool, optional
        An optional boolean indicating whether to display a summary of
        the dask client, including a link to monitor progress of the
        analysis. Set to False to hide this display.

    Returns
    -------
    client : dask.distributed.Client
        The started local client.
    """
    # NOTE(review): the previous version attempted to close an earlier client
    # via `locals().get('client', None)`, but a fresh call frame never has
    # that local, so the branch was dead code and has been removed.
    on_aws = 'AWS_ACCESS_KEY_ID' in os.environ

    if on_aws:
        # Configure dashboard link to go over the JupyterHub proxy.
        dask.config.set({
            "distributed.dashboard.link":
                os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/') + "proxy/{port}/status"
        })

    # Start up a local cluster (same call on AWS and NCI).
    client = start_local_dask(mem_safety_margin=spare_mem)

    if on_aws:
        # Configure GDAL for S3 access on the client and its workers.
        configure_s3_access(aws_unsigned=True, client=client)

    # Show the dask cluster settings.
    if display_client:
        display(client)

    return client
def create_local_dask_cluster(spare_mem='3Gb', aws_unsigned=True, display_client=True, **kwargs):
    """
    Generate a local dask cluster via the datacube utility
    ``start_local_dask`` and configure GDAL/S3 access for it.

    Example use :

        import sys
        sys.path.append("../Scripts")
        from deafrica_dask import create_local_dask_cluster

        create_local_dask_cluster(spare_mem='4Gb')

    Parameters
    ----------
    spare_mem : String, optional
        The amount of memory, in Gb, to leave for the notebook to run.
        This memory will not be used by the cluster. e.g '3Gb'
    aws_unsigned : Bool, optional
        Whether S3 access is unsigned. Set to True for publicly available
        datasets, False when credentials are required (e.g. Landsat C2
        provisional data needs aws_unsigned=False); credentials are passed
        to processing threads, local or on the dask cluster.
    display_client : Bool, optional
        Whether to display a summary of the dask client, including a link
        to monitor analysis progress. Set to False to hide this display.
    **kwargs:
        Additional keyword arguments passed to start_local_dask(),
        e.g. n_workers can be set to be greater than 1.
    """
    # Route the dashboard link through the JupyterHub proxy.
    proxy_prefix = os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/')
    dask.config.set({"distributed.dashboard.link": proxy_prefix + "proxy/{port}/status"})

    # Start up a local cluster.
    client = start_local_dask(mem_safety_margin=spare_mem, **kwargs)

    # Configure GDAL for S3 access.
    configure_s3_access(aws_unsigned=aws_unsigned, client=client)

    # Show the dask cluster settings.
    if display_client:
        display(client)
def __init__(self):
    """
    Prepare the Celery worker environment: put the conda env on PATH and
    enable requester-pays S3 access when AWS credentials are present.
    """
    # Set some Celery env vars.
    # Tried (1) sourcing `/var/www/[.profile|.bashrc]` in `/etc/init.d/data_cube_ui`,
    # (2) writing to `/etc/default/[data_cube_ui|celerybeat]`, and
    # (3) writing a Python file to define Celery environment variables to `exec` in Celery code.
    # All were unsuccessful.
    # TODO: This should run on worker init, not when running a task.
    current_path = os.environ['PATH']
    if '/miniconda/envs/odc/bin' not in current_path:
        os.environ['PATH'] = '/miniconda/envs/odc/bin:' + current_path

    # Configure ODC to load from requester-pays S3 buckets, but only when
    # both halves of the credential pair are available.
    have_key = os.environ.get('AWS_ACCESS_KEY_ID') is not None
    have_secret = os.environ.get('AWS_SECRET_ACCESS_KEY') is not None
    if have_key and have_secret:
        configure_s3_access(requester_pays=True)
def test_rio_configure_aws_access(monkeypatch, without_aws_env, dask_client):
    """
    Verify configure_s3_access() picks up fake AWS credentials from the
    environment, both in-process and when propagated to Dask workers.
    """
    fake_env = {
        "AWS_ACCESS_KEY_ID": "fake-key-id",
        "AWS_SECRET_ACCESS_KEY": "fake-secret",
        "AWS_DEFAULT_REGION": "fake-region",
    }
    for name, value in fake_env.items():
        monkeypatch.setenv(name, value)

    def check_creds(creds):
        # Frozen credentials must reflect the fake env, with no session token.
        frozen = creds.get_frozen_credentials()
        assert frozen.access_key == 'fake-key-id'
        assert frozen.secret_key == 'fake-secret'
        assert frozen.token is None

    # Local (in-process) configuration.
    check_creds(configure_s3_access())

    env = activate_from_config()
    assert env is not None
    for key in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_REGION'):
        assert key in env
    assert 'AWS_SESSION_TOKEN' not in env

    env = get_rio_env(sanitize=False)
    assert env is not None
    assert env['AWS_ACCESS_KEY_ID'] == 'fake-key-id'
    assert env['AWS_SECRET_ACCESS_KEY'] == 'fake-secret'
    assert env['AWS_REGION'] == 'fake-region'
    assert env['GDAL_DISABLE_READDIR_ON_OPEN'] == 'EMPTY_DIR'
    ee_local = env

    # Same checks, but configured through a Dask client onto its workers.
    client = dask_client
    check_creds(configure_s3_access(client=client))

    env = client.submit(activate_from_config).result()
    assert env is not None
    for key in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_REGION'):
        assert key in env
    assert 'AWS_SESSION_TOKEN' not in env

    def _activate_and_get(sanitize=True):
        activate_from_config()
        return get_rio_env(sanitize=sanitize)

    # Worker-side rio env must match the local one exactly.
    env = client.submit(_activate_and_get, sanitize=False).result()
    assert env == ee_local
def query_handler(self, time, feature, dask_client=None, parameters=None):
    """
    Load data for (time, feature), process it into a dataframe, render a
    chart, and return both as ``{"data": df, "chart": chart}``.

    A Dask client is created if none is supplied; either way the client is
    closed when the work is done (``with dask_client`` block).
    """
    parameters = {} if parameters is None else parameters

    if dask_client is None:
        dask_client = Client(
            n_workers=num_workers(), processes=True, threads_per_worker=1
        )

    with dask_client:
        # Unsigned S3 access, region from env (defaults to "auto").
        configure_s3_access(
            aws_unsigned=True,
            region_name=os.getenv("AWS_DEFAULT_REGION", "auto"),
            client=dask_client,
        )
        with datacube.Datacube() as dc:
            data = self.input_data(dc, time, feature)
        proc_params = {"time": time, "feature": feature, **parameters}
        df = self.process_data(data, proc_params)
        chart = self.render_chart(df)

    return {"data": df, "chart": chart}
def __init__(
    self,
    config: Optional[FeaturePathConfig] = None,
    geobox_dict: Optional[Dict] = None,
    client: Optional[Client] = None,
):
    """
    Initialise with an optional config, geobox dictionary and Dask client.
    When no client is supplied, a local threads-only cluster is started
    (90% of detected memory) and configured for unsigned S3 access.
    """
    self.config = config if config else FeaturePathConfig()
    self.geobox_dict = geobox_dict

    if not client:
        # Local cluster sized from the machine: all CPUs as threads,
        # memory capped at 90% of what is available.
        client = start_local_dask(
            threads_per_worker=get_max_cpu(),
            processes=False,
            memory_limit=int(0.9 * get_max_mem()),
        )
        configure_s3_access(aws_unsigned=True, cloud_defaults=True, client=client)

    self.client = client

    setup_logging()
    self._log = logging.getLogger(__name__)
from IPython.display import display, Image
import matplotlib.pyplot as plt

plt.rcParams["axes.facecolor"] = "magenta"  # makes transparent pixels obvious
import numpy as np
import xarray as xr

# %%
from dask.distributed import Client, wait as dask_wait
from datacube.utils.dask import start_local_dask
from datacube.utils.rio import configure_s3_access

# Manual toggle: flip to True to start a fresh local cluster on port 11311
# instead of attaching to an already-running scheduler at that address.
if False:
    client = start_local_dask(scheduler_port=11311, threads_per_worker=16)
    configure_s3_access(aws_unsigned=True, cloud_defaults=True, client=client)
else:
    client = Client("tcp://127.0.0.1:11311")
    client.restart()

# Bare expression: displays the client summary in the notebook.
client

# %%
from odc.algo import to_rgba
from odc.ui import to_jpeg_data


def mk_roi(y, x, sz=256):
    # 2D slice selecting an sz-by-sz window whose top-left corner is (y, x).
    return np.s_[y : y + sz, x : x + sz]
def run_gm(cache_file, tasks, dryrun, verbose, threads, x_chunks, y_chunks, overwrite, public, location):
    """
    Run GM stats (drives the ``._gm`` pipeline; the original docstring's
    "Pixel Quality" heading looked like a copy-paste from ``run_pq``).

    Task could be one of the 3 things

    \b
    1. Comma-separated triplet: period,x,y or 'x[+-]<int>/y[+-]<int>/period
       2019--P1Y,+003,-004
       2019--P1Y/3/-4 `/` is also accepted
       x+003/y-004/2019--P1Y is accepted as well

    2. A zero based index
    3. A slice following python convention <start>:<stop>[:<step]
        ::10 -- every tenth task: 0,10,20,..
       1::10 -- every tenth but skip first one 1, 11, 21 ..
        :100 -- first 100 tasks

    If no tasks are supplied the whole file will be processed.
    """
    from tqdm.auto import tqdm
    from functools import partial
    import dask
    import psutil

    from .io import S3COGSink
    from ._gm import gm_input_data, gm_reduce, gm_product
    from .proc import process_tasks
    from .tasks import TaskReader
    from datacube.utils.dask import start_local_dask
    from datacube.utils.rio import configure_s3_access

    # Turn off distributed's automatic worker memory management
    # (target/spill/pause/terminate thresholds all disabled).
    dask.config.set({'distributed.worker.memory.target': False})
    dask.config.set({'distributed.worker.memory.spill': False})
    dask.config.set({'distributed.worker.memory.pause': False})
    dask.config.set({'distributed.worker.memory.terminate': False})

    # config
    resampling = 'bilinear'
    COG_OPTS = dict(
        compress='deflate',
        predict=2,
        zlevel=6,
        blocksize=800,
        ovr_blocksize=256,  # ovr_blocksize must be powers of 2 for some reason in GDAL
        overview_resampling='bilinear')
    ncpus = psutil.cpu_count()
    # ..

    # threads <= 0 means: use every available CPU per worker.
    if threads <= 0:
        threads = ncpus

    rdr = TaskReader(cache_file)
    product = gm_product(location=location)

    if verbose:
        print(repr(rdr))

    def _proc(task):
        # Assemble the (lazy) input for one task, re-chunk it, and apply
        # the gm reduction. Chunking: y split into y_chunks row bands for
        # load, then time collapsed and x split into x_chunks for reduce.
        NY, NX = task.geobox.shape
        ds_in = gm_input_data(task, resampling=resampling, chunk=(NY // y_chunks, NX))
        tdim = list(ds_in.dims)[0]
        ds_in = ds_in.chunk({tdim: -1, 'x': NX // x_chunks})
        ds = gm_reduce(ds_in,
                       num_threads=ncpus // x_chunks + 2,
                       wk_rows=(NY // y_chunks) // 4,
                       as_array=True)
        return ds

    def dry_run_proc(task, sink, check_s3=False):
        # Print a one-line summary for a task without computing anything;
        # optionally checks S3 for an existing output. Returns the URI.
        uri = sink.uri(task)
        exists = None
        if check_s3:
            exists = sink.exists(task)
        nds = len(task.datasets)
        ndays = len(set(ds.center_time.date() for ds in task.datasets))
        if overwrite:
            flag = {None: '', True: ' (recompute)', False: ' (new)'}[exists]
        else:
            flag = {None: '', True: ' (skip)', False: ' (new)'}[exists]
        task_id = f"{task.short_time}/{task.tile_index[0]:+05d}/{task.tile_index[1]:+05d}"
        print(f"{task_id} days={ndays:03} ds={nds:04} {uri}{flag}")
        return uri

    # No task selectors supplied -> process every tile in the cache file.
    if len(tasks) == 0:
        tasks = rdr.all_tiles
        if verbose:
            print(f"Found {len(tasks):,d} tasks in the file")
    else:
        try:
            tasks = parse_all_tasks(tasks, rdr.all_tiles)
        except ValueError as e:
            print(str(e), file=sys.stderr)
            sys.exit(1)

        if verbose:
            print(f"Will process {len(tasks):,d} tasks")

    sink = S3COGSink(cog_opts=COG_OPTS, public=public)

    # Writing to S3 requires working credentials up-front.
    if product.location.startswith('s3:'):
        if not sink.verify_s3_credentials():
            print("Failed to load S3 credentials")
            sys.exit(2)

        if verbose and sink._creds:
            creds_rw = sink._creds
            print(f'creds: ..{creds_rw.access_key[-5:]} ..{creds_rw.secret_key[-5:]}')

    _tasks = rdr.stream(tasks, product)

    client = None
    if not dryrun:
        if verbose:
            print("Starting local Dask cluster")

        client = start_local_dask(threads_per_worker=threads,
                                  mem_safety_margin='1G')
        # TODO: aws_unsigned is not always desirable
        configure_s3_access(aws_unsigned=True,
                            cloud_defaults=True,
                            client=client)
        if verbose:
            print(client)

    if dryrun:
        results = map(partial(dry_run_proc, sink=sink, check_s3=not overwrite), _tasks)
    else:
        results = process_tasks(_tasks, _proc, client, sink,
                                check_exists=not overwrite,
                                chunked_persist=x_chunks,
                                verbose=verbose)

    if not dryrun and verbose:
        results = tqdm(results, total=len(tasks))

    for p in results:
        if verbose and not dryrun:
            print(p)

    if verbose:
        print("Exiting")

    if client is not None:
        client.close()
def create_local_dask_cluster(spare_mem='3Gb',
                              aws_unsigned=True,
                              display_client=True,
                              start_local_dask_kwargs=None,
                              configure_s3_access_kwargs=None):
    """
    Start a local dask cluster sized from the machine's CPU topology and
    configure it for S3 access.

    Credit belongs to Digital Earth Africa:
    https://github.com/digitalearthafrica/deafrica-sandbox-notebooks/blob/master/Scripts/deafrica_dask.py

    Example use :

        import sys
        sys.path.append("../Scripts")
        from deafrica_dask import create_local_dask_cluster

        create_local_dask_cluster(spare_mem='4Gb')

    Parameters
    ----------
    spare_mem : String, optional
        The amount of memory, in Gb, to leave for the notebook to run.
        This memory will not be used by the cluster. e.g '3Gb'
    aws_unsigned : Bool, optional
        Whether S3 access is unsigned. True for publicly available
        datasets, False for private data (e.g. Landsat C2 provisional
        requires aws_unsigned=False); credentials are passed on to
        processing threads, local or on the dask cluster.
    display_client : Bool, optional
        Whether to display a summary of the dask client, including a
        link to monitor analysis progress. Set to False to hide it.
    start_local_dask_kwargs : dict, optional
        Keyword arguments for `datacube.utils.dask.start_local_dask`
        (e.g. number of workers, threads per worker, memory limit).
        Missing 'n_workers'/'threads_per_worker' entries are filled in
        (note: the dict is mutated in place via setdefault).
    configure_s3_access_kwargs : dict, optional
        Keyword arguments for `datacube.utils.rio.configure_s3_access`.

    Returns
    -------
    client : dask.distributed.Client
        The started local client.
    """
    if start_local_dask_kwargs is None:
        start_local_dask_kwargs = {}
    if configure_s3_access_kwargs is None:
        configure_s3_access_kwargs = {}

    # Route the dashboard link through the JupyterHub proxy.
    proxy_prefix = os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/')
    dask.config.set({
        "distributed.dashboard.link": proxy_prefix + "proxy/{port}/status"
    })

    # Size defaults from CPU topology: one worker per physical core minus one.
    n_physical = psutil.cpu_count(logical=False)
    n_logical = psutil.cpu_count(logical=True)
    logical_per_physical = n_logical / n_physical

    start_local_dask_kwargs.setdefault('n_workers', n_physical - 1)
    # NOTE(review): this default scales threads_per_worker with n_workers,
    # which can oversubscribe logical CPUs when n_workers > 1 — confirm
    # this is intended rather than e.g. n_logical // n_workers.
    start_local_dask_kwargs.setdefault(
        'threads_per_worker',
        int(logical_per_physical * start_local_dask_kwargs['n_workers']))

    client = start_local_dask(mem_safety_margin=spare_mem, **start_local_dask_kwargs)

    # Configure GDAL for s3 access.
    configure_s3_access(aws_unsigned=aws_unsigned,
                        client=client,
                        **configure_s3_access_kwargs)

    return client
def run_pq(cache_file, tasks, dryrun, verbose, threads, overwrite, public, location):
    """
    Run Pixel Quality stats

    Task could be one of the 3 things

    \b
    1. Comma-separated triplet: period,x,y or 'x[+-]<int>/y[+-]<int>/period
       2019--P1Y,+003,-004
       2019--P1Y/3/-4 `/` is also accepted
       x+003/y-004/2019--P1Y is accepted as well

    2. A zero based index
    3. A slice following python convention <start>:<stop>[:<step]
        ::10 -- every tenth task: 0,10,20,..
       1::10 -- every tenth but skip first one 1, 11, 21 ..
        :100 -- first 100 tasks

    If no tasks are supplied the whole file will be processed.
    """
    from tqdm.auto import tqdm
    from functools import partial

    from .io import S3COGSink
    from ._pq import pq_input_data, pq_reduce, pq_product
    from .proc import process_tasks
    from .tasks import TaskReader
    from datacube.utils.dask import start_local_dask
    from datacube.utils.rio import configure_s3_access

    # config
    resampling = 'nearest'
    COG_OPTS = dict(compress='deflate',
                    predict=2,
                    zlevel=6,
                    blocksize=800)
    # ..

    rdr = TaskReader(cache_file)
    product = pq_product(location=location)

    if verbose:
        print(repr(rdr))

    def pq_proc(task):
        # Assemble the (lazy) input for one task and apply the pq reduction.
        ds_in = pq_input_data(task, resampling=resampling)
        ds = pq_reduce(ds_in)
        return ds

    def dry_run_proc(task, sink, check_s3=False):
        # Print a one-line summary for a task without computing anything;
        # optionally checks S3 for an existing output. Returns the URI.
        uri = sink.uri(task)
        exists = None
        if check_s3:
            exists = sink.exists(task)
        nds = len(task.datasets)
        ndays = len(set(ds.center_time.date() for ds in task.datasets))
        if overwrite:
            flag = {None: '', True: ' (recompute)', False: ' (new)'}[exists]
        else:
            flag = {None: '', True: ' (skip)', False: ' (new)'}[exists]
        task_id = f"{task.short_time}/{task.tile_index[0]:+05d}/{task.tile_index[1]:+05d}"
        print(f"{task_id} days={ndays:03} ds={nds:04} {uri}{flag}")
        return uri

    # No task selectors supplied -> process every tile in the cache file.
    if len(tasks) == 0:
        tasks = rdr.all_tiles
        if verbose:
            print(f"Found {len(tasks):,d} tasks in the file")
    else:
        try:
            tasks = parse_all_tasks(tasks, rdr.all_tiles)
        except ValueError as e:
            print(str(e), file=sys.stderr)
            sys.exit(1)

        if verbose:
            print(f"Will process {len(tasks):,d} tasks")

    sink = S3COGSink(cog_opts=COG_OPTS, public=public)

    # Writing to S3 requires working credentials up-front.
    if product.location.startswith('s3:'):
        if not sink.verify_s3_credentials():
            print("Failed to load S3 credentials")
            sys.exit(2)

        if verbose and sink._creds:
            creds_rw = sink._creds
            print(f'creds: ..{creds_rw.access_key[-5:]} ..{creds_rw.secret_key[-5:]}')

    _tasks = rdr.stream(tasks, product)

    client = None
    if not dryrun:
        if verbose:
            print("Starting local Dask cluster")

        client = start_local_dask(threads_per_worker=threads,
                                  mem_safety_margin='1G')
        # TODO: aws_unsigned is not always desirable
        configure_s3_access(aws_unsigned=True,
                            cloud_defaults=True,
                            client=client)
        if verbose:
            print(client)

    if dryrun:
        results = map(partial(dry_run_proc, sink=sink, check_s3=not overwrite), _tasks)
    else:
        results = process_tasks(_tasks, pq_proc, client, sink,
                                check_exists=not overwrite,
                                verbose=verbose)

    if not dryrun and verbose:
        results = tqdm(results, total=len(tasks))

    for p in results:
        if verbose and not dryrun:
            print(p)

    if verbose:
        print("Exiting")

    if client is not None:
        client.close()