def test_index_dataset_with_sources(index, default_metadata_type):
    type_ = index.products.add_document(_pseudo_telemetry_dataset_type)

    parent_doc = _telemetry_dataset.copy()
    parent = Dataset(type_, parent_doc, None, sources={})
    child_doc = _telemetry_dataset.copy()
    child_doc['lineage'] = {'source_datasets': {'source': _telemetry_dataset}}
    child_doc['id'] = '051a003f-5bba-43c7-b5f1-7f1da3ae9cfb'
    child = Dataset(type_,
                    child_doc,
                    local_uri=None,
                    sources={'source': parent})

    with pytest.raises(MissingRecordError):
        index.datasets.add(child, sources_policy='skip')

    index.datasets.add(child, sources_policy='ensure')
    assert index.datasets.get(parent.id)
    assert index.datasets.get(child.id)

    index.datasets.add(child, sources_policy='skip')
    index.datasets.add(child, sources_policy='ensure')
    index.datasets.add(child, sources_policy='verify')
    # Deprecated property, but it should still work until we remove it completely.
    index.datasets.add(child, sources_policy='skip')

    parent_doc['platform'] = {'code': 'LANDSAT_9'}
    index.datasets.add(child, sources_policy='ensure')
    index.datasets.add(child, sources_policy='skip')

    with pytest.raises(DocumentMismatchError):
        index.datasets.add(child, sources_policy='verify')


def test_index_dataset_with_location(index, default_metadata_type):
    """
    :type index: datacube.index._api.Index
    :type default_metadata_type: datacube.model.MetadataType
    """
    first_file = '/tmp/first/something.yaml'
    second_file = '/tmp/second/something.yaml'
    type_ = index.datasets.types.add_document(_pseudo_telemetry_dataset_type)
    dataset = Dataset(type_, _telemetry_dataset, Path(first_file).absolute().as_uri())
    dataset = index.datasets.add(dataset)

    assert dataset.id == _telemetry_uuid
    # TODO: Dataset types?
    assert dataset.type.id == type_.id
    assert dataset.metadata_type.id == default_metadata_type.id

    assert dataset.local_path.absolute() == Path(first_file).absolute()

    # Ingesting again should have no effect.
    index.datasets.add(dataset)
    locations = index.datasets.get_locations(dataset)
    assert len(locations) == 1

    first_as_uri = Path(first_file).absolute().as_uri()
    second_as_uri = Path(second_file).absolute().as_uri()

    # Ingesting with a new path should add the second one too.
    dataset.local_uri = second_as_uri
    index.datasets.add(dataset)
    locations = index.datasets.get_locations(dataset)
    assert len(locations) == 2
    # Newest to oldest.
    assert locations == [second_as_uri, first_as_uri]
    # And the second one is newer, so it should be returned as the default local path:
    assert dataset.local_path.absolute() == Path(second_file).absolute()
Example #3
def test_index_dataset_with_sources(index, default_metadata_type):
    type_ = index.products.add_document(_pseudo_telemetry_dataset_type)

    parent_doc = _telemetry_dataset.copy()
    parent = Dataset(type_, parent_doc, None, sources={})
    child_doc = _telemetry_dataset.copy()
    child_doc['lineage'] = {'source_datasets': {'source': _telemetry_dataset}}
    child_doc['id'] = '051a003f-5bba-43c7-b5f1-7f1da3ae9cfb'
    child = Dataset(type_, child_doc, sources={'source': parent})

    with pytest.raises(MissingRecordError):
        index.datasets.add(child, with_lineage=False)

    index.datasets.add(child)
    assert index.datasets.get(parent.id)
    assert index.datasets.get(child.id)

    assert len(index.datasets.bulk_get([parent.id, child.id])) == 2

    index.datasets.add(child, with_lineage=False)
    index.datasets.add(child, with_lineage=True)

    parent_doc['platform'] = {'code': 'LANDSAT_9'}
    index.datasets.add(child, with_lineage=True)
    index.datasets.add(child, with_lineage=False)

    # Backwards-compatibility code path checks; don't use this in normal code.
    for p in ('skip', 'ensure', 'verify'):
        index.datasets.add(child, sources_policy=p)
Example #4
def mk_sample_dataset(bands,
                      uri='file:///tmp',
                      product_name='sample',
                      format='GeoTiff',
                      timestamp=None,
                      id='3a1df9e0-8484-44fc-8102-79184eab85dd'):
    # pylint: disable=redefined-builtin
    image_bands_keys = 'path layer band'.split(' ')
    measurement_keys = 'dtype units nodata aliases name'.split(' ')

    def with_keys(d, keys):
        return dict((k, d[k]) for k in keys if k in d)

    measurements = [with_keys(m, measurement_keys) for m in bands]
    image_bands = dict((m['name'], with_keys(m, image_bands_keys)) for m in bands)

    ds_type = mk_sample_product(product_name,
                                measurements=measurements)

    if timestamp is None:
        timestamp = '2018-06-29'

    return Dataset(ds_type, {
        'id': id,
        'format': {'name': format},
        'image': {'bands': image_bands},
        'time': timestamp,
    }, uris=[uri])
Example #5
def mk_sample_dataset(bands,
                      uri='file:///tmp',
                      product_name='sample',
                      format='GeoTiff',
                      id='12345678123456781234567812345678'):
    image_bands_keys = 'path layer band'.split(' ')
    measurement_keys = 'dtype units nodata aliases name'.split(' ')

    def with_keys(d, keys):
        return dict((k, d[k]) for k in keys if k in d)

    measurements = [with_keys(m, measurement_keys) for m in bands]
    image_bands = dict(
        (m['name'], with_keys(m, image_bands_keys)) for m in bands)

    ds_type = mk_sample_product(product_name, measurements=measurements)

    return Dataset(ds_type, {
        'id': id,
        'format': {
            'name': format
        },
        'image': {
            'bands': image_bands
        }
    },
                   uris=[uri])
Example #6
def mk_ds(zone, datum="GDA94"):
    return Dataset(
        product, {
            "grid_spatial": {
                "projection": {
                    "zone": zone,
                    "datum": datum,
                    "ellipsoid": "GRS80",
                    "orientation": "NORTH_UP",
                    "geo_ref_points": {
                        "ll": {
                            "x": 537437.5,
                            "y": 5900512.5
                        },
                        "lr": {
                            "x": 781687.5,
                            "y": 5900512.5
                        },
                        "ul": {
                            "x": 537437.5,
                            "y": 6117112.5
                        },
                        "ur": {
                            "x": 781687.5,
                            "y": 6117112.5
                        }
                    },
                    "map_projection": "UTM",
                    "resampling_option": "CUBIC_CONVOLUTION"
                }
            }
        })
Example #7
File: utils.py  Project: terranis/agdc-v2
def make_dataset(product,
                 sources,
                 extent,
                 center_time,
                 valid_data=None,
                 uri=None,
                 app_info=None):
    """
    Create :class:`datacube.model.Dataset` for the data

    :param DatasetType product: Product the dataset is part of
    :param list[:class:`Dataset`] sources: datasets used to produce the dataset
    :param Geometry extent: extent of the dataset
    :param Geometry valid_data: extent of the valid data
    :param center_time: time of the central point of the dataset
    :param str uri: The uri of the dataset
    :param dict app_info: Additional metadata to be stored about the generation of the product
    :rtype: :class:`Dataset`
    """
    document = {}
    merge(document, product.metadata_doc)
    merge(document, new_dataset_info())
    merge(document, machine_info())
    merge(document, band_info(product.measurements.keys()))
    merge(document, source_info(sources))
    merge(document, geobox_info(extent, valid_data))
    merge(document, time_info(center_time))
    merge(document, app_info or {})

    return Dataset(
        product,
        document,
        local_uri=uri,
        sources={str(idx): dataset
                 for idx, dataset in enumerate(sources)})
Example #8
    def resolve_no_lineage(ds, uri):
        doc = ds.doc_without_lineage_sources
        try:
            product = match_product(doc)
        except BadMatch as e:
            return None, e

        return Dataset(product, doc, uris=[uri], sources={}), None
Example #9
def match_dataset(dataset_doc, uri, rules):
    """
    :rtype: datacube.model.Dataset
    """
    rule = match_doc(rules, dataset_doc)
    sources = {cls: match_dataset(source_doc, None, rules)
               for cls, source_doc in rule['type'].dataset_reader(dataset_doc).sources.items()}
    return Dataset(rule['type'], dataset_doc, uri, sources=sources)
Example #10
def create_dataset(dataset_doc, uri, rules):
    """
    :rtype: datacube.model.Dataset
    """
    dataset_type = find_matching_product(rules, dataset_doc)
    sources = {cls: create_dataset(source_doc, None, rules)
               for cls, source_doc in dataset_type.dataset_reader(dataset_doc).sources.items()}
    return Dataset(dataset_type, dataset_doc, uris=[uri] if uri else None, sources=sources)
Example #11
def doc2ds(doc, products):
    if doc is None:
        return None

    p = products.get(doc['product'], None)
    if p is None:
        raise ValueError('No product named: %s' % doc['product'])
    return Dataset(p, doc['metadata'], uris=doc['uris'])


def _build_dataset(doc):
    sources = {
        name: _build_dataset(src)
        for name, src in doc['lineage']['source_datasets'].items()
    }
    return Dataset(_EXAMPLE_DATASET_TYPE,
                   doc,
                   uris=['file://test.zzz'],
                   sources=sources)
Example #13
def doc2ds(doc: Optional[Document],
           products: Dict[str, DatasetType]) -> Optional[Dataset]:
    if doc is None:
        return None

    p = products.get(doc['product'], None)
    if p is None:
        raise ValueError('No product named: %s' % doc['product'])
    return Dataset(p, doc['metadata'], uris=doc['uris'])
Example #14
def doc2ds(
    doc: Optional[Document], products: Dict[str, DatasetType]
) -> Optional[Dataset]:
    if doc is None:
        return None

    p = products.get(doc["product"], None)
    if p is None:
        raise ValueError("No product named: %s" % doc["product"])
    return Dataset(p, doc["metadata"], uris=doc["uris"])
Example #15
    def _make(self, dataset_res, full_info=False):
        """
        :rtype: datacube.model.Dataset

        :param bool full_info: Include all available fields
        """
        return Dataset(self.types.get(dataset_res.dataset_type_ref),
                       dataset_res.metadata,
                       dataset_res.local_uri,
                       indexed_by=dataset_res.added_by if full_info else None,
                       indexed_time=dataset_res.added if full_info else None)
Example #16
def test_multiband_support_in_datasetsource(example_gdal_path):
    defn = {
        "id": '12345678123456781234567812345678',
        "format": {
            "name": "GeoTiff"
        },
        "image": {
            "bands": {
                'green': {
                    'type': 'reflective',
                    'cell_size': 25.0,
                    'path': example_gdal_path,
                    'label': 'Coastal Aerosol',
                    'number': '1',
                },
            }
        }
    }

    # Without new band attribute, default to band number 1
    d = Dataset(_EXAMPLE_DATASET_TYPE, defn, uris=['file:///tmp'])

    ds = RasterDatasetDataSource(BandInfo(d, 'green'))

    bandnum = ds.get_bandnumber(None)

    assert bandnum == 1

    with ds.open() as foo:
        data = foo.read()
        assert isinstance(data, np.ndarray)

    #############
    # With new 'image.bands.[band].band' attribute
    band_num = 3
    defn['image']['bands']['green']['band'] = band_num
    d = Dataset(_EXAMPLE_DATASET_TYPE, defn, uris=['file:///tmp'])

    ds = RasterDatasetDataSource(BandInfo(d, 'green'))

    assert ds.get_bandnumber(None) == band_num
Example #17
def test_add_eo3(sample_doc, sample_doc_180, eo3_product):
    doc = add_eo3_parts(sample_doc)
    assert doc is not sample_doc
    ds = Dataset(eo3_product, doc)
    assert ds.crs == 'EPSG:3857'
    assert ds.extent is not None
    assert ds.extent.crs == 'EPSG:3857'
    assert ds.metadata.lat.begin < ds.metadata.lat.end
    assert ds.metadata.lon.begin < ds.metadata.lon.end

    doc = dict(**sample_doc, geometry=ds.extent.buffer(-1).json)

    ds2 = Dataset(eo3_product, add_eo3_parts(doc))
    assert ds2.crs == 'EPSG:3857'
    assert ds2.extent is not None
    assert ds2.extent.crs == 'EPSG:3857'
    assert ds2.metadata.lat.begin < ds2.metadata.lat.end
    assert ds2.metadata.lon.begin < ds2.metadata.lon.end
    assert ds.extent.contains(ds2.extent)

    doc = add_eo3_parts(sample_doc_180)
    assert doc is not sample_doc_180
    ds = Dataset(eo3_product, doc)
    assert ds.crs == 'EPSG:32660'
    assert ds.extent is not None
    assert ds.extent.crs == 'EPSG:32660'
    assert ds.metadata.lat.begin < ds.metadata.lat.end
    assert ds.metadata.lon.begin < 180 < ds.metadata.lon.end

    doc = dict(**sample_doc)
    doc.pop('crs')
    with pytest.raises(ValueError):
        add_eo3_parts(doc)

    doc = dict(**sample_doc)
    doc.pop('grids')
    with pytest.raises(ValueError):
        add_eo3_parts(doc)

    with pytest.raises(ValueError):
        eo3_lonlat_bbox({})
Example #18
    def mk_dataset(ds, uri):
        uuid = ds.id

        if uuid is None:
            return None, None, "Metadata document it missing id field"

        existing = index.datasets.get(uuid)
        if existing is None:
            return None, None, "No such dataset in the database: {}".format(uuid)

        return Dataset(existing.type,
                       ds.doc_without_lineage_sources,
                       uris=[uri]), existing, None
Example #19
def test_multiband_support_in_datasetsource():
    defn = {
        "id": '12345678123456781234567812345678',
        "format": {"name": "hdf"},
        'measurements': {'green': {'nodata': -999}},
        "image": {
            "bands": {
                'green': {
                    'type': 'reflective',
                    'cell_size': 25.0,
                    'path': 'product/LS8_OLITIRS_NBAR_P54_GALPGS01-002_112_079_20140126_B1.tif',
                    'label': 'Coastal Aerosol',
                    'number': '1',
                },
            }
        }
    }

    # Without new band attribute, default to band number 1
    d = Dataset(_EXAMPLE_DATASET_TYPE, defn, uris=['file:///tmp'])

    ds = RasterDatasetSource(d, measurement_id='green')

    bandnum = ds.get_bandnumber(None)

    assert bandnum == 1

    #############
    # With new 'image.bands.[band].band' attribute
    band_num = 3
    defn['image']['bands']['green']['band'] = band_num
    d = Dataset(_EXAMPLE_DATASET_TYPE, defn, uris=['file:///tmp'])

    ds = RasterDatasetSource(d, measurement_id='green')

    assert ds.get_bandnumber(None) == band_num
Example #20
def to_dc_dataset(
    dc: Datacube,
    rendered: Dict[str, Any],
    ds_type: Optional[DatasetType] = None,
    transform: Callable = stac_transform,
    product_name: str = "crop_mask",
) -> Dataset:
    """
    Build a Dataset from a STAC-transformed document.
    """
    if not ds_type:
        ds_type = dict(
            (d.name, d)
            for d in dc.index.datasets.types.get_all())[product_name]
    return Dataset(ds_type, transform(rendered))
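A short usage sketch for to_dc_dataset above; rendered_stac_item stands in for a STAC item already rendered to a plain dict and is an assumption for illustration.
# Usage sketch: rendered_stac_item is a placeholder for a rendered STAC item dict.
from datacube import Datacube

dc = Datacube()
ds = to_dc_dataset(dc, rendered_stac_item, product_name="crop_mask")
print(ds.id, ds.type.name)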
Example #21
    def _make(self, dataset_res, full_info=False):
        """
        :rtype: Dataset

        :param bool full_info: Include all available fields
        """
        uris = dataset_res.uris
        if uris:
            uris = [uri for uri in uris if uri] if uris else []
        return Dataset(type_=self.types.get(dataset_res.dataset_type_ref),
                       metadata_doc=dataset_res.metadata,
                       uris=uris,
                       indexed_by=dataset_res.added_by if full_info else None,
                       indexed_time=dataset_res.added if full_info else None,
                       archived_time=dataset_res.archived)
Example #22
    def _make(self, dataset_res, full_info=False):
        """
        :rtype: datacube.model.Dataset

        :param bool full_info: Include all available fields
        """
        uri = dataset_res.uri
        return Dataset(
            self.types.get(dataset_res.dataset_type_ref),
            dataset_res.metadata,
            # We guarantee that this property on the class is only a local uri.
            uri if uri and uri.startswith('file:') else None,
            indexed_by=dataset_res.added_by if full_info else None,
            indexed_time=dataset_res.added if full_info else None,
            archived_time=dataset_res.archived)
Example #23
    def mk_dataset(ds, uri):
        uuid = ds.id

        if uuid is None:
            return None, None, "Metadata document it missing id field"

        existing = index.datasets.get(uuid)
        if existing is None:
            return None, None, "No such dataset in the database: {}".format(uuid)

        ds = SimpleDocNav(prep_eo3(ds.doc, auto_skip=True))

        # TODO: what about sources=?
        return Dataset(existing.type,
                       ds.doc_without_lineage_sources,
                       uris=[uri]), existing, None
Example #24
File: hl.py  Project: zs856/datacube-core
        def resolve_ds(ds, sources, cache=None):
            cached = cache.get(ds.id)
            if cached is not None:
                return cached

            uris = [uri] if ds.id == main_uuid else []

            doc = ds.doc

            db_ds = db_dss.get(ds.id)
            if db_ds:
                product = db_ds.type
            else:
                product = match_product(doc)

            return with_cache(Dataset(product, doc, uris=uris, sources=sources), ds.id, cache)
Example #25
def mk_sample_dataset(bands,
                      uri='file:///tmp',
                      product_name='sample',
                      format='GeoTiff',
                      timestamp=None,
                      id='3a1df9e0-8484-44fc-8102-79184eab85dd',
                      geobox=None,
                      product_opts=None):
    # pylint: disable=redefined-builtin
    image_bands_keys = 'path layer band'.split(' ')
    measurement_keys = 'dtype units nodata aliases name'.split(' ')

    def with_keys(d, keys):
        return dict((k, d[k]) for k in keys if k in d)

    measurements = [with_keys(m, measurement_keys) for m in bands]
    image_bands = dict(
        (m['name'], with_keys(m, image_bands_keys)) for m in bands)

    if product_opts is None:
        product_opts = {}

    ds_type = mk_sample_product(product_name,
                                measurements=measurements,
                                **product_opts)

    if timestamp is None:
        timestamp = '2018-06-29'
    if uri is None:
        uris = []
    elif isinstance(uri, list):
        uris = uri.copy()
    else:
        uris = [uri]

    return Dataset(ds_type, {
        'id': id,
        'format': {
            'name': format
        },
        'image': {
            'bands': image_bands
        },
        'time': timestamp,
        **geobox_to_gridspatial(geobox),
    },
                   uris=uris)
Example #26
def generate_dataset(data, sources, prod_info, uri):
    nudata = data.copy()

    datasets = []
    for idx, (time,
              sources) in enumerate(zip(sources.time.values, sources.values)):
        document = {
            'id': str(uuid.uuid4()),
            'image': {
                'bands': {
                    name: {
                        'path': '',
                        'layer': name
                    }
                    for name in nudata.data_vars
                }
            },
            'lineage': {
                'source_datasets': {
                    str(idx): dataset.metadata_doc
                    for idx, dataset in enumerate(sources)
                }
            }
        }
        # TODO: extent is a bad thing to store - it duplicates coordinates
        set_geobox_info(document, data.crs, data.extent)
        document['extent']['from_dt'] = str(time)
        document['extent']['to_dt'] = str(time)
        document['extent']['center_dt'] = str(time)
        document.update(prod_info.metadata)
        dataset = Dataset(
            prod_info,
            document,
            local_uri=uri,
            sources={str(idx): dataset
                     for idx, dataset in enumerate(sources)})
        datasets.append(dataset)
    nudata['dataset'] = (['time'],
                         numpy.array([
                             yaml.dump(dataset.metadata_doc,
                                       Dumper=SafeDumper,
                                       encoding='utf-8')
                             for dataset in datasets
                         ],
                                     dtype='S'))
    return nudata, datasets
Example #27
        def on_success(dataset: DatasetDoc, dataset_path: Path):
            """
            Index the dataset
            """
            product_name = dataset.product.name
            product = products.get(product_name)
            if not product:
                product = index.products.get_by_name(product_name)
                if not product:
                    raise ValueError(
                        f"Product {product_name} not found in ODC index")
                products[product_name] = product

            index.datasets.add(
                Dataset(product,
                        serialise.to_doc(dataset),
                        uris=dataset.locations))
            _LOG.debug("Indexed dataset",
                       dataset_id=dataset.id,
                       dataset_path=dataset_path)
Example #28
def all_datasets(dc: Datacube,
                 product: str,
                 read_chunk: int = 1000,
                 limit: Optional[int] = None):
    """
    Like dc.find_datasets_lazy(product=product) but actually lazy, using db cursors
    """
    import psycopg2
    from random import randint

    assert isinstance(limit, (int, type(None)))

    db = psycopg2.connect(str(dc.index.url))
    _limit = "" if limit is None else f"LIMIT {limit}"

    _product = dc.index.products.get_by_name(product)

    query = f"""select
jsonb_build_object(
  'product', %(product)s,
  'uris', array((select _loc_.uri_scheme ||':'||_loc_.uri_body
                 from agdc.dataset_location as _loc_
                 where _loc_.dataset_ref = agdc.dataset.id and _loc_.archived is null
                 order by _loc_.added desc, _loc_.id desc)),
  'metadata', metadata) as dataset
from agdc.dataset
where archived is null
and dataset_type_ref = (select id from agdc.dataset_type where name = %(product)s)
{_limit};
"""
    cursor_name = "c{:04X}".format(randint(0, 0xFFFF))
    with db.cursor(name=cursor_name) as cursor:
        cursor.execute(query, dict(product=product))

        while True:
            chunk = cursor.fetchmany(read_chunk)
            if not chunk:
                break
            for (ds, ) in chunk:
                yield Dataset(_product, ds["metadata"], ds["uris"])
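A short usage sketch for the streaming generator above; the product name and limit are placeholders.
# Usage sketch (placeholder product name): stream a large product without
# materialising every dataset document in memory at once.
from datacube import Datacube

dc = Datacube()
for ds in all_datasets(dc, 'ls8_nbar_scene', read_chunk=500, limit=10000):
    print(ds.id, ds.uris[0] if ds.uris else None)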
Example #29
    def update(self, dataset: Dataset, updates_allowed=None):
        """
        Update dataset metadata and location
        :param Dataset dataset: Dataset to update
        :param updates_allowed: Allowed updates
        :rtype: Dataset
        """
        existing = self.get(dataset.id)
        can_update, safe_changes, unsafe_changes = self.can_update(
            dataset, updates_allowed)

        if not safe_changes and not unsafe_changes:
            self._ensure_new_locations(dataset, existing)
            _LOG.info("No changes detected for dataset %s", dataset.id)
            return dataset

        for offset, old_val, new_val in safe_changes:
            _LOG.info("Safe change in %s from %r to %r",
                      _readable_offset(offset), old_val, new_val)

        for offset, old_val, new_val in unsafe_changes:
            _LOG.warning("Unsafe change in %s from %r to %r",
                         _readable_offset(offset), old_val, new_val)

        if not can_update:
            raise ValueError(f"Unsafe changes in {dataset.id}: " + (", ".join(
                _readable_offset(offset) for offset, _, _ in unsafe_changes)))

        _LOG.info("Updating dataset %s", dataset.id)

        product = self.types.get_by_name(dataset.type.name)
        with self._db.begin() as transaction:
            if not transaction.update_dataset(
                    dataset.metadata_doc_without_lineage(), dataset.id,
                    product.id):
                raise ValueError("Failed to update dataset %s..." % dataset.id)

        self._ensure_new_locations(dataset, existing)

        return dataset
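A caller-side sketch of the update flow above; the dataset id, the edited offset, and the use of datacube.utils.changes.allow_any are illustrative assumptions (check datacube.utils.changes in your version).
# Caller-side sketch (placeholder id and offset): permit one otherwise-unsafe
# change while leaving all other offsets guarded by the default policy.
from datacube.utils import changes

ds = index.datasets.get(dataset_id)                  # dataset_id assumed known
ds.metadata_doc['platform'] = {'code': 'LANDSAT_9'}  # hypothetical edit
index.datasets.update(ds, updates_allowed={('platform',): changes.allow_any})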
Example #30
def add_dataset(pr, dt, metadict, file):
    """Add a dataset to the datacube database

    It's added to 2 tables:
      - dataset: with all the metadata
      - dataset_location

    Args:
        pr (ProductResource): A ProductResource object, contained in the return of
            ``add_product``
        dt (DatasetType): A DatasetType object, contained in the return of ``add_product``
        metadict (dict): Dictionary containing dataset metadata, generally generated
            by ``metadict_from_netcdf``
        file (str): Path of the file to add to the index

    Return:
        No return; the function is used for its side effect of adding a dataset to the datacube database
    """
    db = PostgresDb.from_config(CONFIG)
    dataset_resource = DatasetResource(db, pr)
    dataset = Dataset(dt, metadict, sources={})
    dataset_resource.add(dataset)
    uid = metadict['id']
    dataset_resource.add_location(uid, file)
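A usage sketch for add_dataset above, assuming the companion helpers add_product and metadict_from_netcdf from the same module behave as the docstring describes; paths, the product definition argument, and the helper signatures are assumptions.
# Usage sketch (assumed companion helpers from the same module): register the
# product once, then index a single NetCDF file against it.
pr, dt = add_product('product_definition.yaml')           # assumed to return (ProductResource, DatasetType)
meta = metadict_from_netcdf('/data/tiles/tile_0001.nc')   # assumed signature
add_dataset(pr, dt, meta, '/data/tiles/tile_0001.nc')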