Example #1
class IndexGEETestCase(unittest.TestCase):
    def setUp(self):
        self.datacube = Datacube(config=DATACUBE_CONFIG)

        IndexerTestCase().test_product_generation()
        product = self.datacube.index.products.get_by_name('ls8_test')
        if product is None:
            self.skipTest('No product available to index')
        datasets = self.datacube.find_datasets(product='ls8_test')
        if datasets:
            self.skipTest('Indexed datasets already exist in database')

    def test_index_gee(self):
        product = 'ls8_test'
        latitude = (-4.15, -3.90)
        longitude = (39.50, 39.75)
        time = '2020-01'

        cmd = [
            "index_gee", "--product", product, "--latitude",
            str(latitude), "--longitude",
            str(longitude), "--time", time, "--config", DATACUBE_CONFIG,
            "--no_confirm", "-u"
        ]
        subprocess.check_output(cmd)
        datasets = self.datacube.find_datasets(product=product)
        self.assertGreater(len(datasets), 0,
                           'Expected to find datasets in index')
Example #2
def test_query_dataset_multi_product(index: Index, ls5_dataset_w_children: Dataset):
    # We have one ls5 level1 and its child nbar
    dc = Datacube(index)

    # Can we query a single product name?
    datasets = dc.find_datasets(product='ls5_nbar_scene')
    assert len(datasets) == 1

    # Can we query multiple products?
    datasets = dc.find_datasets(product=['ls5_nbar_scene', 'ls5_level1_scene'])
    assert len(datasets) == 2

    # Can we query multiple products in a tuple?
    datasets = dc.find_datasets(product=('ls5_nbar_scene', 'ls5_level1_scene'))
    assert len(datasets) == 2
Example #3
def check_data_with_api(index, time_slices):
    """Chek retrieved data for specific values.

    We scale down by 100 and check for predefined values in the
    corners.
    """
    from datacube import Datacube
    dc = Datacube(index=index)

    # Make the retrieved data 100 times coarser
    shape_x = int(GEOTIFF['shape']['x'] / 100.0)
    shape_y = int(GEOTIFF['shape']['y'] / 100.0)
    pixel_x = int(GEOTIFF['pixel_size']['x'] * 100)
    pixel_y = int(GEOTIFF['pixel_size']['y'] * 100)

    input_type_name = 'ls5_nbar_albers'
    input_type = dc.index.products.get_by_name(input_type_name)
    geobox = geometry.GeoBox(
        shape_x + 1, shape_y + 1,
        Affine(pixel_x, 0.0, GEOTIFF['ul']['x'], 0.0, pixel_y,
               GEOTIFF['ul']['y']), geometry.CRS(GEOTIFF['crs']))
    observations = dc.find_datasets(product='ls5_nbar_albers',
                                    geopolygon=geobox.extent)
    group_by = query_group_by('time')
    sources = dc.group_datasets(observations, group_by)
    data = dc.load_data(sources, geobox, input_type.measurements.values())
    assert hashlib.md5(
        data.green.data).hexdigest() == '7f5ace486e88d33edf3512e8de6b6996'
    assert hashlib.md5(
        data.blue.data).hexdigest() == 'b58204f1e10dd678b292df188c242c7e'
    for time_slice in range(time_slices):
        assert data.blue.values[time_slice][-1, -1] == -999
Example #4
    def __call__(self, index, product, time, group_by) -> Tile:
        # Do for a specific poly whose boundary is known
        output_crs = CRS(self.storage['crs'])
        filtered_items = [
            'geopolygon', 'lon', 'lat', 'longitude', 'latitude', 'x', 'y'
        ]
        filtered_dict = {
            k: v
            for k, v in self.input_region.items() if k in filtered_items
        }
        if self.feature is not None:
            filtered_dict['geopolygon'] = self.feature.geopolygon
            geopoly = filtered_dict['geopolygon']
        else:
            geopoly = query_geopolygon(**self.input_region)

        dc = Datacube(index=index)
        datasets = dc.find_datasets(product=product,
                                    time=time,
                                    group_by=group_by,
                                    **filtered_dict)
        group_by = query_group_by(group_by=group_by)
        sources = dc.group_datasets(datasets, group_by)
        output_resolution = [
            self.storage['resolution'][dim] for dim in output_crs.dimensions
        ]
        geopoly = geopoly.to_crs(output_crs)
        geobox = GeoBox.from_geopolygon(geopoly, resolution=output_resolution)

        return Tile(sources, geobox)
Example #5
    def __get_mask_datasets(self) -> List[ODCDataset]:
        """ Finds mask datasets based on config """
        dc = Datacube(app="mosaic_creator")
        time_range = (str(self.__start_date), str(self.__end_date))
        datasets = dc.find_datasets(product=self.__product_name, time=time_range)
        if not datasets:
            LOGGER.warning("No mask datasets found for "
                           f"product={self.__product_name}, time={time_range}")
            raise ValueError("No datasets found")  # TODO: custom exception
        return datasets
Example #6
def ordered_dss(dc: Datacube, freq: str = 'm', **query):
    """Emulate "order by time" streaming interface for datacube queries.

        Basic idea is to perform a lot of smaller queries (shorter time
        periods), sort results then yield them to the calling code.
    """
    qq = Query(**query)

    for q in chop_query_by_time(qq, freq=freq):
        dss = dc.find_datasets(**q.search_terms)
        dss.sort(key=lambda ds: ds.center_time)
        yield from dss
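A minimal usage sketch for the ordered_dss helper above, assuming an accessible Datacube index; the product name and time range are placeholders:

from datacube import Datacube

dc = Datacube(app="ordered-stream-demo")
# Stream datasets month by month in time order instead of building one
# large result list; 'ls8_example' is a hypothetical product name.
for ds in ordered_dss(dc, freq='m', product='ls8_example', time=('2020-01', '2020-06')):
    print(ds.id, ds.center_time)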
Example #7
def check_open_with_api(driver_manager, time_slices):
    from datacube import Datacube
    dc = Datacube(driver_manager=driver_manager)

    input_type_name = 'ls5_nbar_albers'
    input_type = dc.index.products.get_by_name(input_type_name)
    geobox = geometry.GeoBox(200, 200, Affine(25, 0.0, 638000, 0.0, -25, 6276000), geometry.CRS('EPSG:28355'))
    observations = dc.find_datasets(product='ls5_nbar_albers', geopolygon=geobox.extent)
    group_by = query_group_by('time')
    sources = dc.group_datasets(observations, group_by)
    data = dc.load_data(sources, geobox, input_type.measurements.values(), driver_manager=driver_manager)
    assert data.blue.shape == (time_slices, 200, 200)
Example #8
def check_open_with_api(index):
    from datacube import Datacube
    dc = Datacube(index=index)

    input_type_name = 'ls5_nbar_albers'
    input_type = dc.index.products.get_by_name(input_type_name)

    geobox = GeoBox(200, 200, Affine(25, 0.0, 1500000, 0.0, -25, -3900000),
                    CRS('EPSG:3577'))
    observations = dc.find_datasets(product='ls5_nbar_albers',
                                    geopolygon=geobox.extent)
    group_by = query_group_by('time')
    sources = dc.group_datasets(observations, group_by)
    data = dc.load_data(sources, geobox, input_type.measurements.values())
    assert data.blue.shape == (1, 200, 200)
Example #9
class ArbitraryTileMaker(object):
    """
    Create a :class:`Tile` which can be used by :class:`GridWorkflow` to later load the required data.

    :param input_region: dictionary of spatial limits for searching for datasets. eg:
            geopolygon
            lat, lon boundaries

    """
    def __init__(self, index, input_region, storage):
        self.dc = Datacube(index=index)
        self.input_region = input_region
        self.storage = storage

    def __call__(self, product, time, group_by) -> Tile:
        # Do for a specific poly whose boundary is known
        output_crs = CRS(self.storage['crs'])
        filtered_item = [
            'geopolygon', 'lon', 'lat', 'longitude', 'latitude', 'x', 'y'
        ]
        filtered_dict = {
            k: v
            for k, v in filter(lambda t: t[0] in filtered_item,
                               self.input_region.items())
        }
        if 'feature_id' in self.input_region:
            filtered_dict['geopolygon'] = Geometry(
                self.input_region['geom_feat'],
                CRS(self.input_region['crs_txt']))
            geopoly = filtered_dict['geopolygon']
        else:
            geopoly = query_geopolygon(**self.input_region)
        datasets = self.dc.find_datasets(product=product,
                                         time=time,
                                         group_by=group_by,
                                         **filtered_dict)
        group_by = query_group_by(group_by=group_by)
        sources = self.dc.group_datasets(datasets, group_by)
        output_resolution = [
            self.storage['resolution'][dim] for dim in output_crs.dimensions
        ]
        geopoly = geopoly.to_crs(output_crs)
        geobox = GeoBox.from_geopolygon(geopoly, resolution=output_resolution)

        return Tile(sources, geobox)
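A brief usage sketch for ArbitraryTileMaker, assuming an open index handle and an indexed 'ls5_nbar_albers' product; the region and storage settings are illustrative placeholders:

input_region = {'lon': (148.0, 148.2), 'lat': (-35.4, -35.2)}      # placeholder extents
storage = {'crs': 'EPSG:3577', 'resolution': {'x': 25, 'y': -25}}  # placeholder grid spec

make_tile = ArbitraryTileMaker(index, input_region, storage)
tile = make_tile(product='ls5_nbar_albers', time=('2008-01-01', '2008-12-31'), group_by='time')
# tile.sources holds the grouped datasets and tile.geobox the target grid;
# a GridWorkflow-style loader can then read pixel data from the Tile.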
Example #10
def check_open_with_api(index, time_slices):
    with rasterio.Env():
        from datacube import Datacube
        dc = Datacube(index=index)

        input_type_name = 'ls5_nbar_albers'
        input_type = dc.index.products.get_by_name(input_type_name)
        geobox = geometry.GeoBox(200, 200, Affine(25, 0.0, 638000, 0.0, -25, 6276000), geometry.CRS('EPSG:28355'))
        observations = dc.find_datasets(product='ls5_nbar_albers', geopolygon=geobox.extent)
        group_by = query_group_by('time')
        sources = dc.group_datasets(observations, group_by)
        data = dc.load_data(sources, geobox, input_type.measurements.values())
        assert data.blue.shape == (time_slices, 200, 200)

        chunk_profile = {'time': 1, 'x': 100, 'y': 100}
        lazy_data = dc.load_data(sources, geobox, input_type.measurements.values(), dask_chunks=chunk_profile)
        assert lazy_data.blue.shape == (time_slices, 200, 200)
        assert (lazy_data.blue.load() == data.blue).all()
Example #11
def check_data_with_api(index, time_slices):
    """Chek retrieved data for specific values.

    We scale down by 100 and check for predefined values in the
    corners.
    """
    from datacube import Datacube
    dc = Datacube(index=index)

    # TODO: this test needs to change, it tests that results are exactly the
    #       same as some time before, but with the current zoom out factor it's
    #       hard to verify that results are as expected even with human
    #       judgement. What it should test is that reading native from the
    #       ingested product gives exactly the same results as reading into the
    #       same GeoBox from the original product. Separate to that there
    #       should be a read test that confirms that what you read from native
    #       product while changing projection is of expected value

    # Make the retrieved data lower res
    ss = 100
    shape_x = int(GEOTIFF['shape']['x'] / ss)
    shape_y = int(GEOTIFF['shape']['y'] / ss)
    pixel_x = int(GEOTIFF['pixel_size']['x'] * ss)
    pixel_y = int(GEOTIFF['pixel_size']['y'] * ss)

    input_type_name = 'ls5_nbar_albers'
    input_type = dc.index.products.get_by_name(input_type_name)
    geobox = geometry.GeoBox(
        shape_x + 2, shape_y + 2,
        Affine(pixel_x, 0.0, GEOTIFF['ul']['x'], 0.0, pixel_y,
               GEOTIFF['ul']['y']), geometry.CRS(GEOTIFF['crs']))
    observations = dc.find_datasets(product='ls5_nbar_albers',
                                    geopolygon=geobox.extent)
    group_by = query_group_by('time')
    sources = dc.group_datasets(observations, group_by)
    data = dc.load_data(sources, geobox, input_type.measurements.values())
    assert hashlib.md5(
        data.green.data).hexdigest() == '0f64647bad54db4389fb065b2128025e'
    assert hashlib.md5(
        data.blue.data).hexdigest() == '41a7b50dfe5c4c1a1befbc378225beeb'
    for time_slice in range(time_slices):
        assert data.blue.values[time_slice][-1, -1] == -999
Example #12
def ordered_dss(dc: Datacube, freq: str = "m", key=None, **query):
    """Emulate "order by time" streaming interface for datacube queries.

        Basic idea is to perform a lot of smaller queries (shorter time
        periods), sort results then yield them to the calling code.

    :param dc: Datacube instance

    :param freq: 'm' month sized chunks, 'w' week sized chunks, 'd' day

    :param key: Optional sorting function Dataset -> Comparable, for example
                ``lambda ds: (ds.center_time, ds.metadata.region_code)``
    """
    qq = Query(**query)
    if key is None:
        key = lambda ds: ds.center_time

    for q in chop_query_by_time(qq, freq=freq):
        dss = dc.find_datasets(**q.search_terms)
        dss.sort(key=key)
        yield from dss
Example #13
    def query(self, dc: Datacube,
              **search_terms: Dict[str, Any]) -> VirtualDatasetBag:
        product = dc.index.products.get_by_name(self._product)
        if product is None:
            raise VirtualProductException("could not find product {}".format(
                self._product))

        merged_terms = merge_search_terms(
            reject_keys(self, self._NON_QUERY_KEYS),
            reject_keys(search_terms, self._NON_QUERY_KEYS))

        query = Query(
            dc.index, **reject_keys(merged_terms,
                                    self._ADDITIONAL_SEARCH_KEYS))
        self._assert(
            query.product == self._product,
            "query for {} returned another product {}".format(
                self._product, query.product))

        return VirtualDatasetBag(dc.find_datasets(**merged_terms),
                                 query.geopolygon, {product.name: product})
Example #14
class DatacubeReplicator(object):
    def __init__(self, config):
        self.remote_host = config['remote_host']
        self.remote_user = config['remote_user']
        self.db_password = config['db_password']
        self.remote_dir = config['remote_dir']
        self.local_dir = config['local_dir']
        self.replication_defns = config['replicated_data']

        self.client = None
        self.sftp = None
        self.tunnel = None
        self.remote_dc_config = None
        self.remote_dc = None
        self.local_index = index_connect()

    def run(self):
        self.connect()
        self.read_remote_config()
        self.connect_to_db()
        self.replicate_all()
        self.disconnect()

    def connect(self):
        client = SSHClient()
        client.load_system_host_keys()
        client.set_missing_host_key_policy(WarningPolicy())
        client.connect(hostname=self.remote_host, username=self.remote_user)

        LOG.debug(client)
        self.client = client
        self.sftp = client.open_sftp()

    def disconnect(self):
        self.client.close()
        self.tunnel.stop()

    def read_remote_config(self):
        remote_config = ConfigParser()
        remote_config.read_string(_DEFAULT_CONF)
        with self.sftp.open('.datacube.conf') as fin:
            remote_config.read_file(fin)
        self.remote_dc_config = LocalConfig(remote_config)

    def connect_to_db(self):
        self.tunnel = SSHTunnelForwarder(
            self.remote_host,
            ssh_username=self.remote_user,
            remote_bind_address=(self.remote_dc_config.db_hostname,
                                 int(self.remote_dc_config.db_port)))
        self.tunnel.start()

        # pylint: disable=protected-access
        self.remote_dc_config._config['datacube']['db_hostname'] = '127.0.0.1'
        self.remote_dc_config._config['datacube']['db_port'] = str(
            self.tunnel.local_bind_port)
        self.remote_dc_config._config['datacube'][
            'db_username'] = self.remote_user
        self.remote_dc_config._config['datacube'][
            'db_password'] = self.db_password

        # This requires the password from somewhere
        # Parsing it out of .pgpass sounds error-prone and fragile
        # Let's put it in the configuration for now
        LOG.debug('Remote configuration loaded %s', self.remote_dc_config)

        self.remote_dc = Datacube(config=self.remote_dc_config)

    def replicate_all(self):

        for defn in tqdm(self.replication_defns, 'Replicating products'):
            self.replicate(defn)

    def replicate_all_products(self):
        products = self.remote_dc.index.products.get_all()
        for product in products:
            self.local_index.products.add(product)

    def replicate(self, defn):
        datasets = list(self.remote_dc.find_datasets(**defn))

        if not datasets:
            LOG.info('No remote datasets found matching %s', defn)
            return

        # TODO: use generator not list
        product = datasets[0].type
        LOG.info('Ensuring remote product is in local index. %s', product)

        self.local_index.products.add(product)

        for dataset in tqdm(datasets, 'Datasets'):
            # dataset = remote_dc.index.datasets.get(dataset.id, include_sources=True)
            # We would need to pull the parent products down too
            # TODO: Include parent source datasets + product definitions
            dataset.sources = {}

            LOG.debug('Replicating dataset %s', dataset)
            remote_path = uri_to_path(dataset.local_uri)
            local_path = self.remote_to_local(uri_to_path(dataset.local_uri))

            # Ensure local path exists
            Path(local_path).parent.mkdir(parents=True, exist_ok=True)

            # Download file
            self.sftp.get(remote_path, local_path)

            # Add to local index
            dataset.local_uri = 'file://' + local_path
            self.local_index.datasets.add(dataset)
            LOG.debug('Downloaded to %s', local_path)

    def remote_to_local(self, remote):
        return remote.replace(self.remote_dir, self.local_dir)
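An illustrative configuration for DatacubeReplicator, matching the keys read in __init__ above; every value is a placeholder, and each replicated_data entry is a set of find_datasets() search terms:

replication_config = {
    'remote_host': 'remote.example.com',
    'remote_user': 'ubuntu',
    'db_password': 'changeme',
    'remote_dir': '/g/data/',
    'local_dir': '/home/ubuntu/data/',
    'replicated_data': [
        {'product': 'ls5_nbar_albers', 'time': ('2016-01-01', '2016-02-01')},
    ],
}

replicator = DatacubeReplicator(replication_config)
replicator.run()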
Example #15
def get_data_opensource(prod_info, input_lon, input_lat, acq_min, acq_max,
                        window_size, no_partial_scenes):

    datacube_config = prod_info[0]
    source_prod = prod_info[1]
    source_band_list = prod_info[2]
    mask_band = prod_info[3]

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        if datacube_config != 'default':
            remotedc = Datacube(config=datacube_config)
        else:
            remotedc = Datacube()

        return_data = {}
        data = xr.Dataset()

        if source_prod != '':
            # find dataset to get metadata
            fd_query = {
                'time': (acq_min, acq_max),
                'x': (input_lon, input_lon + window_size / 100000),
                'y': (input_lat, input_lat + window_size / 100000),
            }
            sample_fd_ds = remotedc.find_datasets(product=source_prod,
                                                  group_by='solar_day',
                                                  **fd_query)

            if len(sample_fd_ds) > 0:
                # decide pixel size for output data
                pixel_x, pixel_y = get_pixel_size(sample_fd_ds[0],
                                                  source_band_list)

                log.info('Output pixel size for product {}: x={}, y={}'.format(
                    source_prod, pixel_x, pixel_y))

                # get target epsg from metadata
                target_epsg = get_epsg(sample_fd_ds[0])
                log.info('CRS for product {}: {}'.format(
                    source_prod, target_epsg))

                x1, y1, x2, y2 = setQueryExtent(target_epsg, input_lon,
                                                input_lat, window_size)

                query = {
                    'time': (acq_min, acq_max),
                    'x': (x1, x2),
                    'y': (y1, y2),
                    'crs': target_epsg,
                    'output_crs': target_epsg,
                    'resolution': (-pixel_y, pixel_x),
                    'measurements': source_band_list
                }

                if 's2' in source_prod:
                    data = remotedc.load(product=source_prod,
                                         group_by='solar_day',
                                         **query)
                else:
                    data = remotedc.load(product=source_prod,
                                         align=(pixel_x / 2.0, pixel_y / 2.0),
                                         group_by='solar_day',
                                         **query)
                # remove cloud and nodata
                data = remove_cloud_nodata(source_prod, data, mask_band)

                if no_partial_scenes:
                    # calculate valid data percentage
                    data = only_return_whole_scene(data)

            return_data = {
                source_prod: {
                    'data': data,
                    'mask_band': mask_band,
                    'find_list': sample_fd_ds
                }
            }

    return return_data
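A hedged example call for get_data_opensource; per the unpacking at the top of the function, prod_info is (datacube config, source product, band list, mask band). The product and band names below are placeholders, not values from the original code:

prod_info = ('default',
             'ga_ls8c_ard_3',
             ['nbart_red', 'nbart_green', 'nbart_blue'],
             'oa_fmask')
result = get_data_opensource(prod_info,
                             input_lon=149.00, input_lat=-35.30,
                             acq_min='2020-01-01', acq_max='2020-03-31',
                             window_size=1000, no_partial_scenes=True)
data = result['ga_ls8c_ard_3']['data']  # xarray.Dataset with cloud/nodata removed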
Example #16
def get_data_opensource_shapefile(prod_info, acq_min, acq_max, shapefile,
                                  no_partial_scenes):

    datacube_config = prod_info[0]
    source_prod = prod_info[1]
    source_band_list = prod_info[2]
    mask_band = prod_info[3]

    if datacube_config != 'default':
        remotedc = Datacube(config=datacube_config)
    else:
        remotedc = Datacube()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        with fiona.open(shapefile) as shapes:
            crs = geometry.CRS(shapes.crs_wkt)
            first_geometry = next(iter(shapes))['geometry']
            geom = geometry.Geometry(first_geometry, crs=crs)

            return_data = {}
            data = xr.Dataset()

            if source_prod != '':
                # get a sample dataset to decide the target epsg
                fd_query = {'time': (acq_min, acq_max), 'geopolygon': geom}
                sample_fd_ds = remotedc.find_datasets(product=source_prod,
                                                      group_by='solar_day',
                                                      **fd_query)

                if len(sample_fd_ds) > 0:
                    # decide pixel size for output data
                    pixel_x, pixel_y = get_pixel_size(sample_fd_ds[0],
                                                      source_band_list)
                    log.info(
                        'Output pixel size for product {}: x={}, y={}'.format(
                            source_prod, pixel_x, pixel_y))

                    # get target epsg from metadata
                    target_epsg = get_epsg(sample_fd_ds[0])
                    log.info('CRS for product {}: {}'.format(
                        source_prod, target_epsg))

                    query = {
                        'time': (acq_min, acq_max),
                        'geopolygon': geom,
                        'output_crs': target_epsg,
                        'resolution': (-pixel_y, pixel_x),
                        'measurements': source_band_list
                    }

                    if 's2' in source_prod:
                        data = remotedc.load(product=source_prod,
                                             group_by='solar_day',
                                             **query)
                    else:
                        data = remotedc.load(product=source_prod,
                                             align=(pixel_x / 2.0,
                                                    pixel_y / 2.0),
                                             group_by='solar_day',
                                             **query)

                    # remove cloud and nodata
                    data = remove_cloud_nodata(source_prod, data, mask_band)

                    if data.data_vars:
                        mask = geometry_mask([geom], data.geobox, invert=True)
                        data = data.where(mask)

                    if no_partial_scenes:
                        # calculate valid data percentage
                        data = only_return_whole_scene(data)

                return_data = {
                    source_prod: {
                        'data': data,
                        'mask_band': mask_band,
                        'find_list': sample_fd_ds
                    }
                }

    return return_data
Example #17
#!/usr/bin/env python

from datetime import date

from datacube import Datacube

dc = Datacube()

# s3://dea-public-data/L2/sentinel-2-nbar/S2MSIARD_NBAR/2020-10-19/S2B_OPER_MSI_ARD_TL_VGS1_20201019T060322_A018905_T50LNP_N02.09/ARD-METADATA.yaml
#
#  's2a_ard_granule',
#  's2b_ard_granule',
#


def ds_to_s3_url(ds):
    # return f"s3://dea-public-data/L2/sentinel-2-nbar/S2MSIARD_NBAR/{ds.key_time.strftime('%Y-%m-%d')}/{ds.metadata_doc['tile_id'].replace('L1C', 'ARD')}/ARD-METADATA.yaml"
    # Don't trust the datetimes that are exposed!
    return f"s3://dea-public-data/L2/sentinel-2-nbar/S2MSIARD_NBAR/{ds.metadata_doc['extent']['center_dt'][:10]}/{ds.metadata_doc['tile_id'].replace('L1C', 'ARD')}/ARD-METADATA.yaml"


for product in ('s2a_ard_granule', 's2b_ard_granule'):
    for year in range(2017, date.today().year + 1):
        for month in range(1, 13):

            for ds in dc.find_datasets(product=product,
                                       time=f'{year}-{month:02}'):
                print(ds_to_s3_url(ds))
Example #18
    def get_l1c_datasets(self) -> List[ODCDataset]:
        """ Gets all L1C datasets from ODC Index """
        dc = Datacube(app="cloud_mask_generator")
        l1c_datasets = dc.find_datasets(product=self.l1c_product_name)
        return l1c_datasets