def munge_metadata(nci_dataset):
    """Strip NBART bands and lineage from an NCI ARD metadata doc and
    re-key it for the S3 rolling archive.

    Mutates *nci_dataset* in place and also returns it.

    :param nci_dataset: parsed ARD-METADATA document (dict)
    :return: the same dict, modified in place
    """
    # Only the NBAR measurements are published; drop every NBART band
    # and the NCI-local lineage. (Was 12 copy-pasted `del` statements.)
    nbart_bands = [
        'nbart_blue',
        'nbart_coastal_aerosol',
        'nbart_contiguity',
        'nbart_green',
        'nbart_nir_1',
        'nbart_nir_2',
        'nbart_red',
        'nbart_red_edge_1',
        'nbart_red_edge_2',
        'nbart_red_edge_3',
        'nbart_swir_2',
        'nbart_swir_3',
    ]
    for band in nbart_bands:
        del nci_dataset['image']['bands'][band]
    del nci_dataset['lineage']

    nci_dataset['creation_dt'] = nci_dataset['extent']['center_dt']  # FIXME: WTF
    nci_dataset['product_type'] = 'S2MSIARD_NBAR'
    # Keep the original id around before it is replaced below.
    nci_dataset['original_id'] = nci_dataset['id']
    nci_dataset['software_versions'].update({
        's2_to_s3_rolling': {
            # FIXME: Update
            'repo': 'https://github.com/GeoscienceAustralia/dea-airflow/',
            'version': '1.0.0'}
    })

    # Create a deterministic dataset ID based on these inputs
    nci_dataset['id'] = str(odc_uuid("s2_to_s3_rolling", "1.0.0",
                                     [nci_dataset['id']]))
    return nci_dataset
def _deterministic_uuid(self, task, algorithm_version=None, **other_tags):
    """Build a reproducible UUID for *task*'s output dataset.

    :param task: task whose settings and source dataset seed the hash
    :param algorithm_version: override; defaults to the transform's
        major.minor version from ``self._get_transform_info()``
    :param other_tags: extra key/value pairs folded into the hash
    :return: ``(uuid, uuid_values)`` — the UUID plus a dict recording
        every input that went into it
    """
    version = algorithm_version
    if version is None:
        version = self._get_transform_info()["version_major_minor"]

    if "dataset_version" not in other_tags:
        try:
            other_tags["dataset_version"] = task.settings.output.metadata[
                "dataset_version"]
        except KeyError:
            # Best-effort: the tag is simply omitted from the hash.
            _LOG.info(
                "dataset_version not set and not used to generate deterministic uuid"
            )

    algorithm = task.settings.specification.transform
    uuid = odc_uuid(
        algorithm=algorithm,
        algorithm_version=version,
        sources=[task.dataset.id],
        **other_tags,
    )

    uuid_values = dict(other_tags)
    uuid_values["algorithm_version"] = version
    uuid_values["dataset.id"] = task.dataset.id
    uuid_values["algorithm"] = algorithm
    return uuid, uuid_values
def __post_init__(self):
    """Derive the short time label; if no UUID was supplied (the
    all-zero UUID sentinel), generate a deterministic one."""
    self.short_time = self.time_range.short
    if self.uuid.int != 0:
        # Caller provided a real UUID — keep it.
        return
    self.uuid = odc_uuid(
        self.product.name,
        self.product.version,
        sources=self._lineage(),
        time=self.short_time,
        tile=self.tile_index,
    )
def replace_metadata(yaml_file, _s3_bucket, s3_metadata_path):
    """
    Replace metadata with additional info and upload it to S3.

    Strips the NBART bands and lineage, records provenance, assigns a
    deterministic dataset id, and writes the result straight to S3.

    :param yaml_file: metadata file in NCI
    :param _s3_bucket: name of s3 bucket
    :param s3_metadata_path: path of metadata file in s3
    """
    s3_resource = boto3.resource("s3").Bucket(_s3_bucket)
    with open(yaml_file) as config_file:
        temp_metadata = yaml.load(config_file, Loader=yaml.CSafeLoader)

    # Only the NBAR measurements go to S3 — drop every NBART band and
    # the NCI-local lineage. (Was 12 copy-pasted `del` statements.)
    for band in (
            'nbart_blue',
            'nbart_coastal_aerosol',
            'nbart_contiguity',
            'nbart_green',
            'nbart_nir_1',
            'nbart_nir_2',
            'nbart_red',
            'nbart_red_edge_1',
            'nbart_red_edge_2',
            'nbart_red_edge_3',
            'nbart_swir_2',
            'nbart_swir_3',
    ):
        del temp_metadata['image']['bands'][band]
    del temp_metadata['lineage']

    temp_metadata['creation_dt'] = temp_metadata['extent']['center_dt']
    temp_metadata['product_type'] = 'S2MSIARD_NBAR'
    # Preserve the original id before it is replaced below.
    temp_metadata['original_id'] = temp_metadata['id']
    temp_metadata['software_versions'].update({
        's2_to_s3_rolling': {
            'repo': 'https://github.com/GeoscienceAustralia/dea-airflow/',
            'version': '1.0.0'
        }
    })

    # Create dataset ID based on Kirill's magic
    temp_metadata['id'] = str(
        odc_uuid("s2_to_s3_rolling", "1.0.0", [temp_metadata['id']]))

    # Write to S3 directly
    s3_resource.Object(key=s3_metadata_path).put(Body=yaml.dump(
        temp_metadata, default_flow_style=False, Dumper=yaml.CSafeDumper))

    LOG.info("Finished uploaded metadata %s to %s", yaml_file, s3_metadata_path)
def generate_yaml(path_or_url, cfg):
    """Render the dataset-metadata YAML template for one raster file.

    :param path_or_url: raster location; the final path component is
        expected to look like ``<prefix>-<region_code>.<ext>``
    :param cfg: product configuration exposing ``product``, ``version``
        and ``period`` attributes
    :return: rendered YAML text
    """
    path = path_or_url.split("/")[-1]
    region_code = path.split(".")[0].split("-")[1]

    # BUG FIX: the original called rasterio.open() without ever closing
    # the dataset — use the context manager so the handle is released.
    with rasterio.open(path_or_url) as src:
        info = dict(
            # Deterministic id: no source datasets, keyed on
            # product/version/period/region instead.
            uuid=odc_uuid(
                cfg.product,
                cfg.version,
                sources=[],
                period=cfg.period,
                region_code=region_code,
            ),
            epsg=src.meta["crs"].to_epsg(),
            region_code=region_code,
            shape=src.shape,
            transform=src.transform,
            path=path,
        )
    return tpl.render(cfg=cfg, **info)
def deterministic_uuid(task, algorithm_version=None, **other_tags):
    """Compute a reproducible UUID for the output of *task*.

    :param task: task whose transform spec and source dataset seed the hash
    :param algorithm_version: override; defaults to the transform's
        major.minor version from ``get_transform_info``
    :param other_tags: extra key/value pairs folded into the hash
    :return: ``(uuid, uuid_values)`` — the UUID and a dict recording
        every input that produced it
    """
    transform = task.settings.specification.transform
    version = algorithm_version
    if version is None:
        version = get_transform_info(transform)['version_major_minor']

    if 'dataset_version' not in other_tags:
        try:
            other_tags['dataset_version'] = task.settings.output.metadata['dataset_version']
        except KeyError:
            # Best-effort: the tag is simply left out of the hash.
            _LOG.info('dataset_version not set and '
                      'not used to generate deterministic uuid')

    uuid = odc_uuid(
        algorithm=transform,
        algorithm_version=version,
        sources=[task.dataset.id],
        **other_tags,
    )

    uuid_values = dict(other_tags)
    uuid_values['algorithm_version'] = version
    uuid_values['dataset.id'] = task.dataset.id
    uuid_values['algorithm'] = transform
    return uuid, uuid_values
def replace_metadata(granule, s3_bucket, s3_metadata_path):
    """Rewrite a granule's ARD metadata for S3 and upload it.

    Strips the NBART bands and lineage, records provenance, assigns a
    deterministic dataset id, and writes the result straight to S3.

    :param granule: granule directory name under ``NCI_DIR``
    :param s3_bucket: destination S3 bucket name
    :param s3_metadata_path: destination key of the metadata file in S3
    """
    s3 = boto3.resource("s3").Bucket(s3_bucket)
    yaml_file = "{nci_path}/{granule}/ARD-METADATA.yaml".format(
        nci_path=NCI_DIR, granule=granule
    )
    with open(yaml_file) as config_file:
        temp_metadata = yaml.load(config_file, Loader=yaml.CSafeLoader)

    # Only the NBAR measurements go to S3 — drop every NBART band and
    # the NCI-local lineage. (Was 12 copy-pasted `del` statements.)
    for band in (
            'nbart_blue',
            'nbart_coastal_aerosol',
            'nbart_contiguity',
            'nbart_green',
            'nbart_nir_1',
            'nbart_nir_2',
            'nbart_red',
            'nbart_red_edge_1',
            'nbart_red_edge_2',
            'nbart_red_edge_3',
            'nbart_swir_2',
            'nbart_swir_3',
    ):
        del temp_metadata['image']['bands'][band]
    del temp_metadata['lineage']

    temp_metadata['creation_dt'] = temp_metadata['extent']['center_dt']
    temp_metadata['product_type'] = 'S2MSIARD_NBAR'
    temp_metadata['original_id'] = temp_metadata['id']
    # BUG FIX: 'software_versions' is a mapping of tool name -> info —
    # the sibling replace_metadata/munge_metadata implementations in this
    # file call .update() on the same field — so .append() (dicts have no
    # append) would raise AttributeError here.
    temp_metadata['software_versions'].update({
        's2_to_s3_rolling': {
            'repo': 'https://github.com/GeoscienceAustralia/dea-orchestration/',
            'version': '1.0.0'}
    })

    # Create dataset ID based on Kirill's magic
    temp_metadata['id'] = str(odc_uuid("s2_to_s3_rolling", "1.0.0",
                                       [temp_metadata['id']]))

    # Write to S3 directly
    s3.Object(key=s3_metadata_path).put(Body=yaml.dump(
        temp_metadata, default_flow_style=False, Dumper=yaml.CSafeDumper)
    )
def stac_transform(input_stac: Document, relative: bool = True) -> Document:
    """Takes in a raw STAC 1.0 dictionary and returns an ODC dictionary.

    :param input_stac: STAC Item document (dict-like)
    :param relative: passed through to band extraction; presumably
        controls whether asset paths are made relative — TODO confirm
        against _get_stac_bands
    :return: EO3-style ODC dataset document
    """
    product_label, product_name, region_code, default_grid = _stac_product_lookup(
        input_stac
    )

    # Generating UUID for products not having UUID.
    # Checking if provided id is valid UUID.
    # If not valid, creating new deterministic uuid using odc_uuid function
    # based on product_name and product_label.
    # TODO: Verify if this approach to create UUID is valid.
    if _check_valid_uuid(input_stac["id"]):
        deterministic_uuid = input_stac["id"]
    else:
        # s2_l2a keeps a legacy algorithm name so existing ids stay stable.
        if product_name in ["s2_l2a"]:
            deterministic_uuid = str(
                odc_uuid("sentinel-2_stac_process", "1.0.0", [product_label])
            )
        else:
            deterministic_uuid = str(
                odc_uuid(f"{product_name}_stac_process", "1.0.0", [product_label])
            )

    # TODO: handle old STAC that doesn't have grid information here...
    bands, grids = _get_stac_bands(input_stac, default_grid, relative=relative)
    stac_properties, lineage = _get_stac_properties_lineage(input_stac)

    properties = input_stac["properties"]
    epsg = properties["proj:epsg"]
    native_crs = f"epsg:{epsg}"

    # Transform geometry to the native CRS at an appropriate precision
    geometry = Geometry(input_stac["geometry"], "epsg:4326")
    if native_crs != "epsg:4326":
        # Arbitrary precisions, but should be fine
        pixel_size = get_in(["default", "transform", 0], grids)
        precision = 0
        # NOTE(review): transform[0] is the pixel x-scale, which is normally
        # positive, so this `< 0` branch looks unreachable — possibly meant
        # `< 1` (sub-unit pixels need more decimal places). Confirm intent.
        if pixel_size < 0:
            precision = 6
        geometry = _geographic_to_projected(geometry, native_crs, precision)

    stac_odc = {
        "$schema": "https://schemas.opendatacube.org/dataset",
        "id": deterministic_uuid,
        "crs": native_crs,
        "grids": grids,
        "product": {"name": product_name.lower()},
        "label": product_label,
        "properties": stac_properties,
        "measurements": bands,
        # Lineage defaults to empty; overwritten below if present.
        "lineage": {},
    }

    if region_code:
        stac_odc["properties"]["odc:region_code"] = region_code

    if geometry:
        stac_odc["geometry"] = geometry.json

    if lineage:
        stac_odc["lineage"] = lineage

    return stac_odc
def fuse_ds(ds_1: Dataset, ds_2: Dataset, product: Optional[DatasetType] = None) -> Dataset:
    """
    This function fuses two datasets. It requires that:
      - the products are fusable
      - grids with the same name are identical
      - labels are in the format 'product_suffix' with identical suffixes
      - CRSs' are identical
      - datetimes are identical
      - $schemas are identical

    :param ds_1: first source dataset
    :param ds_2: second source dataset
    :param product: fused product; derived via ``fuse_products`` when None
    :return: a new Dataset whose document merges both inputs

    NOTE(review): preconditions are enforced with ``assert``, which is
    stripped under ``python -O`` — callers should not rely on these checks
    in optimized runs.
    """
    doc_1, doc_2 = ds_1.metadata_doc, ds_2.metadata_doc
    if product is None:
        product = fuse_products(ds_1.type, ds_2.type)

    fused_doc = dict()
    # Deterministic id derived from the two source ids.
    fused_doc["id"] = str(
        odc_uuid(product.name, "0.0.0", sources=[doc_1["id"], doc_2["id"]]))
    fused_doc["lineage"] = {"source_datasets": [doc_1["id"], doc_2["id"]]}

    # check that all grids with the same name are identical
    common_grids = set(doc_1["grids"].keys()).intersection(
        doc_2["grids"].keys())
    assert all(doc_1["grids"][g] == doc_2["grids"][g] for g in common_grids)

    # TODO: handle the case that grids have conflicts in a separate function
    fused_doc["grids"] = {**doc_1["grids"], **doc_2["grids"]}

    # Labels must share a suffix once each product name is stripped off;
    # the fused label reuses that suffix under the fused product name.
    label_suffix = doc_1["label"].replace(doc_1["product"]["name"], "")
    assert label_suffix == doc_2["label"].replace(doc_2["product"]["name"], "")
    fused_doc["label"] = f"{product.name}{label_suffix}"

    equal_keys = ["$schema", "crs"]
    for key in equal_keys:
        assert doc_1[key] == doc_2[key]
        fused_doc[key] = doc_1[key]

    fused_doc["properties"] = dict()
    assert doc_1["properties"]["datetime"] == doc_2["properties"][
        "datetime"]  # datetime is the only mandatory property

    # copy over all identical properties
    for key, val in doc_1["properties"].items():
        if val == doc_2["properties"].get(key, None):
            fused_doc["properties"][key] = val

    # Merge measurements (doc_2 wins on name clashes, as with grids above).
    fused_doc["measurements"] = {
        **doc_1["measurements"],
        **doc_2["measurements"]
    }
    # Rewrite each measurement's path from the resolved dataset locations.
    for key, path in {
            **measurement_paths(ds_1),
            **measurement_paths(ds_2)
    }.items():
        fused_doc["measurements"][key]["path"] = path

    fused_ds = Dataset(product, prep_eo3(fused_doc), uris=[""])
    return fused_ds