Example #1
    def query(self, dc: Datacube,
              **search_terms: Dict[str, Any]) -> VirtualDatasetBag:
        product = dc.index.products.get_by_name(self._product)
        if product is None:
            raise VirtualProductException("could not find product {}".format(
                self._product))

        originals = Query(dc.index, **reject_keys(self, self._NON_QUERY_KEYS))
        overrides = Query(dc.index,
                          **reject_keys(search_terms, self._NON_QUERY_KEYS))

        query = Query(
            dc.index,
            **merge_search_terms(originals.search_terms,
                                 overrides.search_terms))
        self._assert(
            query.product == self._product,
            "query for {} returned another product {}".format(
                self._product, query.product))

        # find the datasets
        datasets = dc.index.datasets.search(**query.search_terms)
        if query.geopolygon is not None:
            datasets = select_datasets_inside_polygon(datasets,
                                                      query.geopolygon)

        # should we put it in the Transformation class?
        if self.get('dataset_predicate') is not None:
            datasets = [
                dataset for dataset in datasets
                if self['dataset_predicate'](dataset)
            ]

        return VirtualDatasetBag(list(datasets), query.geopolygon,
                                 {product.name: product})
Example #2
def test_query_kwargs():
    from mock import MagicMock

    mock_index = MagicMock()
    mock_index.datasets.get_field_names = lambda: {u'product', u'lat', u'sat_path', 'type_id', u'time', u'lon',
                                                   u'orbit', u'instrument', u'sat_row', u'platform', 'metadata_type',
                                                   u'gsi', 'type', 'id'}

    query = Query(index=mock_index, product='ls5_nbar_albers')
    assert str(query)
    assert query.product == 'ls5_nbar_albers'
    assert query.search_terms['product'] == 'ls5_nbar_albers'

    query = Query(index=mock_index, latitude=(-35, -36), longitude=(148, 149))
    assert query.geopolygon
    assert 'lat' in query.search_terms
    assert 'lon' in query.search_terms

    query = Query(index=mock_index, latitude=-35, longitude=148)
    assert query.geopolygon
    assert 'lat' in query.search_terms
    assert 'lon' in query.search_terms

    query = Query(index=mock_index, y=(-4174726, -4180011), x=(1515184, 1523263), crs='EPSG:3577')
    assert query.geopolygon
    assert 'lat' in query.search_terms
    assert 'lon' in query.search_terms

    query = Query(index=mock_index, y=-4174726, x=1515184, crs='EPSG:3577')
    assert query.geopolygon
    assert 'lat' in query.search_terms
    assert 'lon' in query.search_terms

    query = Query(index=mock_index, y=-4174726, x=1515184, crs='EPSG:3577')
    assert query.geopolygon
    assert 'lat' in query.search_terms
    assert 'lon' in query.search_terms

    query = Query(index=mock_index, time='2001')
    assert 'time' in query.search

    query = Query(index=mock_index, time=('2001', '2002'))
    assert 'time' in query.search

    with pytest.raises(ValueError):
        Query(index=mock_index,
              y=-4174726, coordinate_reference_system='WGS84',
              x=1515184, crs='EPSG:3577')

    with pytest.raises(LookupError):
        Query(index=mock_index, y=-4174726, x=1515184, crs='EPSG:3577', made_up_key='NotReal')

    with pytest.raises(LookupError):
        query_group_by(group_by='magic')

    gb = query_group_by('time')
    assert isinstance(gb, GroupBy)
    assert query_group_by(group_by=gb) is gb
Example #3
    def query(self, dc: Datacube,
              **search_terms: Dict[str, Any]) -> VirtualDatasetBag:
        """ Collection of datasets that match the query. """
        get = self.get

        if 'product' in self:
            product = dc.index.products.get_by_name(self._product)
            if product is None:
                raise VirtualProductException(
                    "could not find product {}".format(self._product))

            originals = Query(dc.index,
                              **reject_keys(self, self._NON_QUERY_KEYS))
            overrides = Query(
                dc.index, **reject_keys(search_terms, self._NON_QUERY_KEYS))

            query = Query(
                dc.index,
                **merge_search_terms(originals.search_terms,
                                     overrides.search_terms))
            self._assert(
                query.product == self._product,
                "query for {} returned another product {}".format(
                    self._product, query.product))

            # find the datasets
            datasets = dc.index.datasets.search(**query.search_terms)
            if query.geopolygon is not None:
                datasets = select_datasets_inside_polygon(
                    datasets, query.geopolygon)

            # should we put it in the Transformation class?
            if get('dataset_predicate') is not None:
                datasets = [
                    dataset for dataset in datasets
                    if self['dataset_predicate'](dataset)
                ]

            return VirtualDatasetBag(list(datasets), product.grid_spec,
                                     query.geopolygon, {product.name: product})

        elif 'transform' in self:
            return self._input.query(dc, **search_terms)

        elif 'collate' in self or 'juxtapose' in self:
            result = [
                child.query(dc, **search_terms) for child in self._children
            ]

            return VirtualDatasetBag(
                {self._kind: [datasets.pile for datasets in result]},
                select_unique([datasets.grid_spec for datasets in result]),
                select_unique([datasets.geopolygon for datasets in result]),
                merge_dicts(
                    [datasets.product_definitions for datasets in result]))

        else:
            raise VirtualProductException("virtual product was not validated")
Example #4
    def query(self, dc, **search_terms):
        # type: (Datacube, Dict[str, Any]) -> QueryResult
        """ Collection of datasets that match the query. """
        get = self.get

        if 'product' in self:
            originals = Query(dc.index,
                              **reject_keys(self, self._NON_QUERY_KEYS))
            overrides = Query(
                dc.index, **reject_keys(search_terms, self._NON_QUERY_KEYS))

            query = Query(
                dc.index,
                **merge_search_terms(originals.search_terms,
                                     overrides.search_terms))
            self._assert(
                query.product == self._product,
                "query for {} returned another product {}".format(
                    self._product, query.product))

            # find the datasets
            datasets = select_datasets_inside_polygon(
                dc.index.datasets.search(**query.search_terms),
                query.geopolygon)

            if get('dataset_predicate') is not None:
                datasets = [
                    dataset for dataset in datasets
                    if get('dataset_predicate')(dataset)
                ]

            # gather information from the index before it disappears from sight
            # this can also possibly be extracted from the product definitions, but this is easier
            grid_spec = dc.index.products.get_by_name(self._product).grid_spec

            return QueryResult(datasets, grid_spec)

        elif 'transform' in self:
            return self._input.query(dc, **search_terms)

        elif 'collate' in self or 'juxtapose' in self:
            result = [
                child.query(dc, **search_terms) for child in self._children
            ]

            grid_spec = select_unique(
                [datasets.grid_spec for datasets in result])
            return QueryResult(result, grid_spec)

        else:
            raise VirtualProductException("virtual product was not validated")
Example #5
def _find_periods_with_data(index,
                            product_names,
                            period_duration='1 day',
                            start_date='1985-01-01',
                            end_date='2000-01-01'):
    """
    Search the datacube and find which periods contain data

    This is very useful when running stats in the `daily` mode (which outputs a file for each day). It is
    very slow to create an output for every day regardless of data availability, so it is better to only find
    the useful days at the beginning.

    :return: sequence of (start_date, end_date) tuples
    """
    # TODO: Read 'simple' job configuration from file
    # TODO: need to get rid of the hard-coded query
    query = dict(y=(-41 * (40000 - 1600), -41 * 40000),
                 x=(15 * 40000, 15 * (40000 + 1600)),
                 crs='EPSG:3577',
                 time=(start_date, end_date))

    valid_dates = set()
    for product in product_names:
        counts = index.datasets.count_product_through_time(
            period_duration, product=product, **Query(**query).search_terms)
        for time_range, count in counts:
            if count > 0:
                time_range = Range(time_range.begin.astimezone(timezone.utc),
                                   time_range.end.astimezone(timezone.utc))
                valid_dates.add(time_range)
    for time_range in sorted(valid_dates):
        yield time_range.begin, time_range.end
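
A minimal sketch of how this generator might be consumed, assuming a reachable datacube index; the app name and the product name ls8_nbar_albers are illustrative placeholders, not taken from the code above:

import datacube

dc = datacube.Datacube(app='find-periods-example')  # placeholder app name
for start, end in _find_periods_with_data(dc.index, product_names=['ls8_nbar_albers']):
    # each tuple bounds a period that actually contains datasets for the product
    print(start.isoformat(), end.isoformat())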
Example #6
def test_convert_descriptor_query_to_search_query():
    descriptor_query = {
        'dimensions': {
            'latitude': {
                'range': (-35.5, -36.5),
            },
            'longitude': {
                'range': (148.3, 149.9)
            },
            'time': {
                'range': (datetime.datetime(2001, 5, 7),
                          datetime.datetime(2002, 3, 9))
            }
        }
    }
    descriptor_query_dimensions = descriptor_query['dimensions']
    query = Query.from_descriptor_request(descriptor_query)
    search_query = query.search_terms
    assert min(descriptor_query_dimensions['latitude']['range']) == search_query['lat'].begin
    assert max(descriptor_query_dimensions['latitude']['range']) == search_query['lat'].end
    assert min(descriptor_query_dimensions['longitude']['range']) == search_query['lon'].begin
    assert max(descriptor_query_dimensions['longitude']['range']) == search_query['lon'].end
    assert datetime.datetime(2001, 5, 7,
                             tzinfo=tz.tzutc()) == search_query['time'].begin
    assert datetime.datetime(2002, 3, 9,
                             tzinfo=tz.tzutc()) == search_query['time'].end
Example #7
def chop_query_by_time(q: Query, freq: str = "m") -> Iterator[Query]:
    """Given a query over longer period of time, chop it up along the time dimension
    into smaller queries each covering a shorter time period (year, month, week or day).
    """
    qq = dict(**q.search_terms)
    time = qq.pop("time", None)
    if time is None:
        raise ValueError("Need time range in the query")

    for (t0, t1) in time_range(time.begin, time.end, freq=freq):
        yield Query(**qq, time=Range(t0, t1))
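
A short usage sketch, assuming the product name is a placeholder; the Query carries a time Range in its search_terms, which chop_query_by_time splits into monthly sub-queries:

from datacube.api.query import Query

q = Query(product='ls8_nbar_albers', time=('2018-01-01', '2018-12-31'))
for sub_query in chop_query_by_time(q, freq='m'):
    # each sub-query repeats the original search terms over roughly one month
    print(sub_query.search_terms['time'])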
Example #8
def chopped_dss(dc: Datacube, freq: str = "m", **query):
    """Emulate streaming interface for datacube queries.

    Basic idea is to perform a lot of smaller queries (shorter time
    periods) and stream the datasets they return.
    """
    qq = Query(**query)

    for q in chop_query_by_time(qq, freq=freq):
        dss = dc.find_datasets_lazy(**q.search_terms)
        yield from dss
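
A hedged usage sketch, assuming a live Datacube connection; the app name, product and time range are placeholders:

import datacube

dc = datacube.Datacube(app='chopped-dss-example')  # placeholder app name
for ds in chopped_dss(dc, freq='m', product='ls8_nbar_albers', time=('2019-01-01', '2019-06-30')):
    # datasets arrive month by month rather than from one large query
    print(ds.id, ds.center_time)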
Example #9
def ordered_dss(dc: Datacube, freq: str = 'm', **query):
    """Emulate "order by time" streaming interface for datacube queries.

        Basic idea is to perform a lot of smaller queries (shorter time
        periods), sort results then yield them to the calling code.
    """
    qq = Query(**query)

    for q in chop_query_by_time(qq, freq=freq):
        dss = dc.find_datasets(**q.search_terms)
        dss.sort(key=lambda ds: ds.center_time)
        yield from dss
Example #10
    def find_datasets(self, limit=None, **search_terms):
        ''' Finds datasets matching the search terms in the local index or in the GEE catalog.

        Args:
            limit (int): Optional; the maximum number of datasets to return
            search_terms (dict): Search parameters to be passed to datacube.api.query.Query

        Returns: A generator of datacube.model.Dataset objects.
        '''
        query = Query(**search_terms)
        if query.product and not isinstance(query.product,
                                            datacube.model.DatasetType):
            query.product = self.index.products.get_by_name(query.product)
            query.asset = query.product.metadata_doc.get('properties').get('gee:asset')
        elif search_terms.get('asset'):
            query.product = self.generate_product(**search_terms)
            query.asset = search_terms.pop('asset')

        product_measurements = query.product.measurements.keys()
        if hasattr(query, 'asset'):
            images = self.get_images(self.build_parameters(query))
            for document in generate_documents(query.asset, images, query.product):
                if limit != 0:
                    limit = limit - 1 if limit is not None else limit
                    if set(product_measurements) == set(document['measurements'].keys()):
                        yield datacube.model.Dataset(query.product, document,
                                                     uris=f'EEDAI://{query.asset}')
                else:
                    break
        else:
            for dataset in super().find_datasets(limit=limit, search_terms=search_terms):
                yield dataset
Example #11
def list_gqa_filtered_cells(index, gw, pix_th=None, cell_index=None, **indexers):
    geobox = gw.grid_spec.tile_geobox(cell_index)
    query = Query(index=index, geopolygon=None, **indexers)
    observations = index.datasets.search_eager(**query.search_terms)
    # filter now with pixel threshold value
    datasets = {}
    if pix_th is None:
        pix_th = 1
    print ("pix_th value", str(pix_th))
    for dataset in observations:                                                          
        if check_intersect(geobox.extent, dataset.extent.to_crs(gw.grid_spec.crs)):
            if get_gqa(index, dataset.id) < pix_th:                                  
                #datasets.append(dataset)
                datasets.setdefault(cell_index,{'datasets': [],
                                    'geobox': geobox})['datasets'].append(dataset)
    return gw.cell_sources(datasets, query_group_by(**indexers))
Example #12
def test_convert_descriptor_query_to_search_query_with_groupby():
    descriptor_query = {
        'dimensions': {
            'time': {
                'range': (datetime.datetime(2001, 5, 7),
                          datetime.datetime(2002, 3, 9)),
                'group_by':
                'solar_day'
            }
        }
    }
    query = Query.from_descriptor_request(descriptor_query)
    assert query.group_by
    assert callable(query.group_by.group_by_func)
    assert query.group_by.dimension == 'time'
    assert query.group_by.units == 'seconds since 1970-01-01 00:00:00'
Example #13
def submit(index: Index,
           app_config: str,
           project: str,
           queue: str,
           no_qsub: bool,
           time_range: Tuple[datetime, datetime],
           tag: str):
    _LOG.info('Tag: %s', tag)

    app_config_path = Path(app_config).resolve()
    app_config = paths.read_document(app_config_path)

    task_desc, task_path = init_task_app(
        job_type="fc",
        source_products=[app_config['source_product']],
        output_products=[app_config['output_product']],
        # TODO: Use @datacube.ui.click.parsed_search_expressions to allow params other than time from the cli?
        datacube_query_args=Query(index=index, time=time_range).search_terms,
        app_config_path=app_config_path,
        pbs_project=project,
        pbs_queue=queue
    )
    _LOG.info("Created task description: %s", task_path)

    if no_qsub:
        _LOG.info('Skipping submission due to --no-qsub')
        return 0

    submit_subjob(
        name='generate',
        task_desc=task_desc,
        command=[
            'generate', '-v', '-v',
            '--task-desc', str(task_path),
            '--tag', tag
        ],
        qsub_params=dict(
            mem='20G',
            wd=True,
            ncpus=1,
            walltime='1h',
            name='fc-generate-{}'.format(tag)
        )
    )
Example #14
def ordered_dss(dc: Datacube, freq: str = "m", key=None, **query):
    """Emulate "order by time" streaming interface for datacube queries.

        Basic idea is to perform a lot of smaller queries (shorter time
        periods), sort results then yield them to the calling code.

    :param dc: Datacube instance

    :param freq: 'm' month sized chunks, 'w' week sized chunks, 'd' day

    :param key: Optional sorting function Dataset -> Comparable, for example
                ``lambda ds: (ds.center_time, ds.metadata.region_code)``
    """
    qq = Query(**query)
    if key is None:
        key = lambda ds: ds.center_time

    for q in chop_query_by_time(qq, freq=freq):
        dss = dc.find_datasets(**q.search_terms)
        dss.sort(key=key)
        yield from dss
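
The docstring's own key example, wired into a call; whether a product exposes metadata.region_code depends on its metadata type, so treat that as an assumption along with the placeholder product and time range:

import datacube

dc = datacube.Datacube(app='ordered-dss-example')  # placeholder app name
for ds in ordered_dss(dc, freq='w',
                      product='ls8_nbar_albers',
                      time=('2019-01-01', '2019-03-31'),
                      key=lambda ds: (ds.center_time, ds.metadata.region_code)):
    print(ds.center_time, ds.metadata.region_code)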
Example #15
def test_convert_descriptor_query_to_search_query_with_crs_conversion():
    descriptor_query = {
        'dimensions': {
            'latitude': {
                'range': (-3971790.0737348166, -4101004.3359463234),
                'crs': 'EPSG:3577',
            },
            'longitude': {
                'range': (1458629.8414059384, 1616407.8831088375),
                'crs': 'EPSG:3577',
            }
        }
    }
    expected_result = {
        'lat': Range(-36.6715565808, -35.3276413143),
        'lon': Range(148.145408153, 150.070966341),
    }
    query = Query.from_descriptor_request(descriptor_query)
    search_query = query.search_terms
    assert all(map(isclose, search_query['lat'], expected_result['lat']))
    assert all(map(isclose, search_query['lon'], expected_result['lon']))
Example #16
    def query(self, dc: Datacube,
              **search_terms: Dict[str, Any]) -> VirtualDatasetBag:
        product = dc.index.products.get_by_name(self._product)
        if product is None:
            raise VirtualProductException("could not find product {}".format(
                self._product))

        merged_terms = merge_search_terms(
            reject_keys(self, self._NON_QUERY_KEYS),
            reject_keys(search_terms, self._NON_QUERY_KEYS))

        query = Query(
            dc.index, **reject_keys(merged_terms,
                                    self._ADDITIONAL_SEARCH_KEYS))
        self._assert(
            query.product == self._product,
            "query for {} returned another product {}".format(
                self._product, query.product))

        return VirtualDatasetBag(dc.find_datasets(**merged_terms),
                                 query.geopolygon, {product.name: product})
Example #17
def run_query(query, config=None, max_datasets=None):
    """
    Load and return the data.

    :param dict query: Search and load parameters passed to datacube.Datacube.load.
    :param str config: Datacube config filepath or None.
    :param int max_datasets: Maximum number of datasets allowed, or None for no limit.

    :return: Data.
    :rtype: xarray.Dataset

    :raise NoDataError: No data found for query
    :raise TooManyDatasetsError: Number of datasets found exceeds max_datasets

    """

    # noinspection PyTypeChecker
    dc = datacube.Datacube(config=config, app='QGIS Plugin')

    test_query = {
        k: query[k]
        for k in ('product', 'time', 'x', 'y', 'crs') if k in query
    }
    test_query = Query(**test_query)
    datasets = dc.index.datasets.search_eager(**test_query.search_terms)

    if not datasets:
        raise NoDataError('No datasets found for query:\n{}'.format(
            str(query)))
    elif max_datasets and len(datasets) > max_datasets:
        msg = (
            'Number of datasets found ({}) exceeds maximum allowed ({}).\n'
            'Reduce your temporal or spatial extent, or increase the maximum in Settings.'
        )
        raise TooManyDatasetsError(msg.format(len(datasets), max_datasets))

    data = dc.load(**query)

    if not data.variables:
        raise NoDataError('No data found for query:\n{}'.format(str(query)))

    return data
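
A usage sketch; the keys mirror the ones the function itself checks for the dataset count ('product', 'time', 'x', 'y', 'crs'), while the product name and extents are placeholders:

query = {
    'product': 'ls8_nbar_albers',
    'time': ('2019-01-01', '2019-01-31'),
    'x': (1515184, 1523263),
    'y': (-4180011, -4174726),
    'crs': 'EPSG:3577',
}
data = run_query(query, config=None, max_datasets=100)
print(data)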
Example #18
def test_convert_descriptor_query_to_search_query_with_single_value():
    descriptor_query = {
        'dimensions': {
            'latitude': {
                'range': -3971790.0737348166,
                'crs': 'EPSG:3577',
            },
            'longitude': {
                'range': 1458629.8414059384,
                'crs': 'EPSG:3577',
            }
        }
    }
    expected_lat = -35.5160921229
    expected_lon = 148.145408153
    query = Query.from_descriptor_request(descriptor_query)
    search_query = query.search_terms
    assert min(*search_query['lat']) <= expected_lat <= max(
        *search_query['lat'])
    assert search_query['lat'].begin != search_query['lat'].end
    assert min(*search_query['lon']) <= expected_lon <= max(
        *search_query['lon'])
    assert search_query['lon'].begin != search_query['lon'].end
Example #19
def test_convert_descriptor_query_to_search_query_with_slices():
    descriptor_query = {
        'dimensions': {
            'latitude': {
                'range': (-35.5, -36.5),
                'array_range': (100, 200)
            },
            'longitude': {
                'range': (148.3, 149.9),
                'array_range': (100, 200)
            },
            'time': {
                'range': (datetime.datetime(2001, 5, 7),
                          datetime.datetime(2002, 3, 9)),
                'array_range': (5, 10)
            }
        }
    }
    query = Query.from_descriptor_request(descriptor_query)
    assert query.slices
    assert query.slices['latitude'] == slice(100, 200)
    assert query.slices['longitude'] == slice(100, 200)
    assert query.slices['time'] == slice(5, 10)
Example #20
def test_descriptor_handles_bad_input():
    with pytest.raises(ValueError):
        descriptor_query = "Not a descriptor"
        Query.from_descriptor_request(descriptor_query)

    with pytest.raises(ValueError):
        descriptor_query = ["Not a descriptor"]
        Query.from_descriptor_request(descriptor_query)

    with pytest.raises(ValueError):
        descriptor_query = {
            'dimensions': {
                'latitude': {
                    'range': -35,
                    'crs': 'EPSG:4326',
                },
                'longitude': {
                    'range': 1458629.8414059384,
                    'crs': 'EPSG:3577',
                }
            }
        }
        Query.from_descriptor_request(descriptor_query)
Example #21
def test_time_handling(time_param, expected):
    query = Query(time=time_param)
    assert 'time' in query.search_terms
    assert query.search_terms['time'] == expected
Example #22
def _dataset_count(index, **query):
    """Return number of datasets matching a query."""
    return index.datasets.count(**Query(**query).search_terms)
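
A one-line usage sketch, assuming an open Datacube handle; the app name, product and time range are placeholders:

import datacube

dc = datacube.Datacube(app='dataset-count-example')  # placeholder app name
print(_dataset_count(dc.index, product='ls8_nbar_albers', time=('2019-01-01', '2019-02-01')))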
Example #23
def _query_polygon(**kw):
    return Query(**kw).geopolygon
Example #24
def submit(index: Index, app_config: str, project: str, queue: str,
           no_qsub: bool, time_range: Tuple[datetime, datetime], tag: str,
           email_options: str, email_id: str, dry_run: bool):
    """
    Kick off two stage PBS job

    Stage 1 (Generate task file):
        The task-app machinery loads a config file, from a path specified on the
        command line, into a dict.

        If dry run is enabled, a dummy DatasetType is created for task generation without
        indexing the product in the database.
        If dry run is disabled, tasks are generated into a file and a PBS job is queued to process them.

    Stage 2 (Run):
        During a normal run, the following steps are performed:
           1) Yield tasks for dispatch to workers
           2) Load data
           3) Run FC algorithm
           4) Attach metadata
           5) Write output files and
           6) Finally index the newly created FC output netCDF files

        If dry run is enabled, application only prepares a list of output files to be created and does not
        record anything in the database.
    """
    _LOG.info('Tag: %s', tag)

    app_config_path = Path(app_config).resolve()
    app_config = paths.read_document(app_config_path)

    if not time_range or not all(time_range):
        query_args = Query(index=index).search_terms
    else:
        query_args = Query(index=index, time=time_range).search_terms

    task_desc, task_path = init_task_app(
        job_type="fc",
        source_products=[app_config['source_product']],
        output_products=[app_config['output_product']],
        # TODO: Use @datacube.ui.click.parsed_search_expressions to allow params other than time from the cli?
        datacube_query_args=query_args,
        app_config_path=app_config_path,
        pbs_project=project,
        pbs_queue=queue)
    _LOG.info("Created task description: %s", task_path)

    if no_qsub:
        _LOG.info('Skipping submission due to --no-qsub')
        return 0

    # If dry run is not enabled just pass verbose option
    dry_run_option = '--dry-run' if dry_run else '-v'
    extra_qsub_args = '-M {0} -m {1}'.format(email_id, email_options)

    # Append email options and email id to the PbsParameters dict key, extra_qsub_args
    task_desc.runtime_state.pbs_parameters.extra_qsub_args.extend(
        extra_qsub_args.split(' '))

    submit_subjob(name='generate',
                  task_desc=task_desc,
                  command=[
                      'generate',
                      '-vv',
                      '--task-desc',
                      str(task_path),
                      '--tag',
                      tag,
                      '--log-queries',
                      '--email-id',
                      email_id,
                      '--email-options',
                      email_options,
                      dry_run_option,
                  ],
                  qsub_params=dict(name='fc-generate-{}'.format(tag),
                                   mem='medium',
                                   wd=True,
                                   nodes=1,
                                   walltime='1h'))
Example #25
def test_query_multiple_products(mock_index):
    q = Query(index=mock_index, product=['ls5_nbar_albers', 'ls7_nbar_albers'])
    assert q.product == ['ls5_nbar_albers', 'ls7_nbar_albers']
Example #26
def dataset_count(index, **query):
    return index.datasets.count(**Query(**query).search_terms)
Example #27
def test_query_issue_1146():
    q = Query(k='AB')
    assert q.search['k'] == 'AB'
Example #28
def test_dateline_query_building():
    lon = Query(x=(618300, 849000), y=(-1876800, -1642500),
                crs='EPSG:32660').search_terms['lon']

    assert lon.begin < 180 < lon.end