def test_valid_document_works(tmp_path: Path, example_metadata: Dict):
    generated_doc = dump_roundtrip(example_metadata)

    # Do a serialisation roundtrip and check that it's still identical.
    reserialised_doc = dump_roundtrip(
        serialise.to_doc(serialise.from_doc(generated_doc))
    )
    assert_same(generated_doc, reserialised_doc)

    assert serialise.from_doc(generated_doc) == serialise.from_doc(reserialised_doc)
def assert_unchanged_after_roundtrip(doc: Dict):
    generated_doc = dump_roundtrip(doc)

    # Do a serialisation roundtrip and check that it's still identical.
    reserialised_doc = dump_roundtrip(
        serialise.to_doc(serialise.from_doc(generated_doc))
    )

    # One allowed difference: input dates can be in many string formats,
    # but we normalise them with a timezone (UTC by default).
    _normalise_datetime_props(generated_doc)

    assert serialise.from_doc(generated_doc) == serialise.from_doc(reserialised_doc)
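# Illustrative use of the helper above in a test. The fixture name is an
# assumption, not from the source:
def test_example_roundtrip(example_metadata: Dict):
    assert_unchanged_after_roundtrip(example_metadata)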
def assert_expected_eo3_path(
    expected_doc: Dict,
    expected_path: Path,
    ignore_fields=(),
):
    """
    Check an output path of an EO3 dataset matches an expected document.

    This is slightly smarter about doing geometry equality etc within the document.
    """
    __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError)
    assert (
        expected_path.exists()
    ), f"Expected output EO3 path doesn't exist: {expected_path}"
    assert_same_as_file(
        expected_doc,
        expected_path,
        # We check the geometry below
        ignore_fields=("geometry",) + tuple(ignore_fields),
    )
    if "geometry" not in ignore_fields:
        # Compare geometry after parsing, rather than comparing the raw dict values.
        produced_dataset = serialise.from_path(expected_path)
        expected_dataset = serialise.from_doc(expected_doc, skip_validation=True)
        if expected_dataset.geometry is None:
            assert produced_dataset.geometry is None
        else:
            assert_shapes_mostly_equal(
                produced_dataset.geometry, expected_dataset.geometry, 0.00000001
            )
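# A minimal usage sketch for the assertion above. The fixture and file names are
# illustrative assumptions; in practice `expected_doc` is a known-good EO3 dict.
def test_example_eo3_output(tmp_path: Path, expected_doc: Dict):
    assert_expected_eo3_path(
        expected_doc,
        tmp_path / "dataset.odc-metadata.yaml",
        ignore_fields=("properties",),
    )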
def add_source_path(
    self,
    *paths: Path,
    classifier: str = None,
    auto_inherit_properties: bool = False,
):
    """
    Record a source dataset using the path to its metadata document.

    :param paths: See other parameters in :func:`DatasetAssembler.add_source_dataset`
    """
    for _, doc in find_and_read_documents(*paths):
        # Newer documents declare a schema.
        if "$schema" in doc:
            self.add_source_dataset(
                serialise.from_doc(doc),
                classifier=classifier,
                auto_inherit_properties=auto_inherit_properties,
            )
        else:
            if auto_inherit_properties:
                raise NotImplementedError(
                    "Can't (yet) inherit properties from old-style metadata"
                )
            classifier = classifier or doc.get("product_type")
            if not classifier:
                # TODO: This rule is a little obscure to force people to know.
                #       We could somehow figure it out from the product?
                raise ValueError(
                    "Source dataset (of old-style eo) doesn't have a 'product_type' "
                    "property (eg. 'level1', 'fc'), you must specify a classifier "
                    "for the kind of source dataset."
                )
            self._lineage[classifier].append(doc["id"])
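# Illustrative call of the method above (the path is an assumption, not from the
# source). For an old-style document without a '$schema', pass a classifier
# explicitly:
#
#   p.add_source_path(
#       Path("LC08_L1TP_090084_metadata.yaml"),
#       classifier="level1",
#   )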
def process_dataset(s3_obj):
    s3_eo3_path = s3_obj.url
    s3_stac_path = s3_eo3_path.replace("eo3", "stac")
    s3_stac_path = s3_stac_path.replace("yaml", "json")
    s3_path = s3_eo3_path.replace("eo3-ARD-METADATA.yaml", "")
    granule = os.path.join(*s3_eo3_path.split("/")[5:-1])
    nci_path = os.path.join(NCI_DIR, *s3_eo3_path.split("/")[5:-1], "ARD-METADATA.yaml")

    if "S2A_OPER_MSI_ARD" in granule:
        platform = "SENTINEL_2A"
    elif "S2B_OPER_MSI_ARD" in granule:
        platform = "SENTINEL_2B"
    else:
        raise ValueError(
            f"Expected granule id to contain either 'S2A_OPER_MSI_ARD' or "
            f"'S2B_OPER_MSI_ARD', found '{granule}'"
        )

    with open(nci_path) as fin:
        eo_metadata = yaml.safe_load(fin)
    eo3_metadata = yaml.safe_load(s3_obj.data)

    eo3_metadata["properties"]["odc:region_code"] = eo_metadata["provider"]["reference_code"]
    eo3_metadata["properties"]["gqa:cep90"] = eo_metadata["gqa"]["residual"]["cep90"]
    eo3_metadata["properties"]["gqa:error_message"] = eo_metadata["gqa"]["error_message"]
    eo3_metadata["properties"]["gqa:final_gcp_count"] = eo_metadata["gqa"]["final_gcp_count"]
    eo3_metadata["properties"]["gqa:ref_source"] = eo_metadata["gqa"]["ref_source"]
    eo3_metadata["properties"]["sentinel:datatake_start_datetime"] = granule.split("_")[-4]
    eo3_metadata["properties"]["eo:platform"] = platform
    eo3_metadata["properties"]["eo:instrument"] = "MSI"

    for key in [
        "abs_iterative_mean",
        "abs",
        "iterative_mean",
        "iterative_stddev",
        "mean",
        "stddev",
    ]:
        eo3_metadata["properties"][f"gqa:{key}_xy"] = eo_metadata["gqa"]["residual"][key]["xy"]

    eo3 = serialise.from_doc(eo3_metadata)
    stac = to_stac_item(
        eo3,
        stac_item_destination_url=s3_stac_path,
        odc_dataset_metadata_url=s3_eo3_path,
        dataset_location=s3_path,
    )
    stac_dump = json.dumps(stac, default=json_fallback, indent=4)
    eo3_dump = yaml.safe_dump(eo3_metadata, default_flow_style=False)

    s3_dump(
        eo3_dump,
        s3_eo3_path,
        ACL="bucket-owner-full-control",
        ContentType="text/vnd.yaml",
    )
    s3_dump(
        stac_dump,
        s3_stac_path,
        ACL="bucket-owner-full-control",
        ContentType="application/json",
    )
def test_location_single_serialisation(tmp_path: Path, l1_ls8_folder_md_expected: Dict):
    # Always serialises a single location as 'location'
    location = "https://some/test/path"

    # Given multiple
    l1_ls8_folder_md_expected["locations"] = [location]

    reserialised_doc = dump_roundtrip(
        serialise.to_doc(serialise.from_doc(l1_ls8_folder_md_expected))
    )

    # We get singular
    assert reserialised_doc["location"] == location
    assert "locations" not in reserialised_doc
def get_dataset_file_offsets(dataset: Dataset) -> Dict[str, str]:
    """
    Get (usually relative) paths for all known files of a dataset.

    Returns a ``{name: url}`` mapping.
    """
    # Get paths to measurements (usually relative, but may not be)
    uri_list = {
        name: m["path"] for name, m in dataset.measurements.items() if m.get("path")
    }

    # Add accessories too, if possible
    if is_doc_eo3(dataset.metadata_doc):
        dataset_doc = serialise.from_doc(dataset.metadata_doc, skip_validation=True)
        uri_list.update({name: a.path for name, a in dataset_doc.accessories.items()})

    return uri_list
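# Example of the mapping returned above (names and paths are illustrative):
# measurement paths keyed by measurement name, plus accessory files when the
# document is EO3.
#
#   {
#       "blue": "band02.tif",
#       "red": "band04.tif",
#       "checksum:sha1": "package.sha1",
#   }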
def check_prepare_outputs(
    invoke_script,
    run_args,
    expected_doc: Dict,
    expected_metadata_path: Path,
    ignore_fields=(),
):
    __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError)
    run_prepare_cli(invoke_script, *run_args)

    assert expected_metadata_path.exists()
    assert_same_as_file(
        expected_doc,
        expected_metadata_path,
        # We check the geometry below
        ignore_fields=("geometry",) + tuple(ignore_fields),
    )

    # Compare geometry after parsing, rather than comparing the raw dict values.
    produced_dataset = serialise.from_path(expected_metadata_path)
    expected_dataset = serialise.from_doc(expected_doc, skip_validation=True)
    assert_shapes_mostly_equal(
        produced_dataset.geometry, expected_dataset.geometry, 0.00000001
    )
def l1_ls8_dataset(l1_ls8_folder_md_expected: Dict) -> DatasetDoc:
    return serialise.from_doc(l1_ls8_folder_md_expected)
def as_stac_item(dataset: DatasetItem):
    """
    Get a dict corresponding to a stac item
    """
    ds: Dataset = dataset.odc_dataset

    if ds is not None and is_doc_eo3(ds.metadata_doc):
        dataset_doc = serialise.from_doc(ds.metadata_doc, skip_validation=True)
        dataset_doc.locations = ds.uris

        # Geometry is optional in eo3, and needs to be calculated from grids if missing.
        # We can use ODC's own calculation that happens on index.
        if dataset_doc.geometry is None:
            fallback_extent = ds.extent
            if fallback_extent is not None:
                dataset_doc.geometry = fallback_extent.geom
                dataset_doc.crs = str(ds.crs)

        if ds.sources:
            dataset_doc.lineage = {
                classifier: [d.id] for classifier, d in ds.sources.items()
            }
        # Does ODC still put legacy lineage into indexed documents?
        elif ("source_datasets" in dataset_doc.lineage) and len(dataset_doc.lineage) == 1:
            # From old to new lineage type.
            dataset_doc.lineage = {
                classifier: [dataset["id"]]
                for classifier, dataset in dataset_doc.lineage["source_datasets"].items()
            }
    else:
        # eo1 to eo3
        dataset_doc = DatasetDoc(
            id=dataset.dataset_id,
            # Filled-in below.
            label=None,
            product=ProductDoc(dataset.product_name),
            locations=ds.uris if ds is not None else None,
            crs=str(dataset.geometry.crs),
            geometry=dataset.geometry.geom,
            grids=None,
            # TODO: Convert these from stac to eo3
            properties=Eo3Dict(
                {
                    "datetime": utc(dataset.center_time),
                    **(dict(_build_properties(ds.metadata)) if ds else {}),
                    "odc:processing_datetime": utc(dataset.creation_time),
                }
            ),
            measurements={
                name: _band_to_measurement(
                    b,
                    dataset_location=ds.uris[0] if ds is not None and ds.uris else None,
                )
                for name, b in ds.measurements.items()
            }
            if ds is not None
            else {},
            accessories=_accessories_from_eo1(ds.metadata_doc) if ds is not None else {},
            # TODO: Fill in lineage. The datacube API only gives us full datasets,
            #       which is expensive. We only need a list of IDs here.
            lineage={},
        )

    if dataset_doc.label is None and ds is not None:
        dataset_doc.label = _utils.dataset_label(ds)

    item_doc = eo3stac.to_stac_item(
        dataset=dataset_doc,
        stac_item_destination_url=url_for(
            ".item",
            collection=dataset.product_name,
            dataset_id=dataset.dataset_id,
        ),
        odc_dataset_metadata_url=url_for("dataset.raw_doc", id_=dataset.dataset_id),
        explorer_base_url=url_for("default_redirect"),
    )
    # Add the region code that Explorer inferred.
    # (Explorer's region codes predate ODC's, and support many more products.)
    item_doc["properties"]["cubedash:region_code"] = dataset.region_code

    return item_doc
def validate_dataset(
    doc: Dict,
    product_definition: Optional[Dict] = None,
    thorough: bool = False,
    readable_location: Union[str, Path] = None,
    expect_extra_measurements: bool = False,
) -> ValidationMessages:
    """
    Validate a dataset document, optionally against the given product.

    By default this will only look at the metadata. Run with thorough=True to
    open the data files too.

    :param product_definition: Optionally check that the dataset matches this product definition.
    :param thorough: Open the imagery too, to check that data types etc match.
    :param readable_location: Dataset location to use, if not the metadata path.
    :param expect_extra_measurements:
        Allow some dataset measurements to be missing from the product definition.
        This is (deliberately) allowed by ODC, but often a mistake.
        This flag disables the warning.
    """
    schema = doc.get("$schema")
    if schema is None:
        yield _error(
            "no_schema",
            f"No $schema field. "
            f"You probably want an ODC dataset schema {model.ODC_DATASET_SCHEMA_URL!r}",
        )
        return
    if schema != model.ODC_DATASET_SCHEMA_URL:
        yield _error(
            "unknown_doc_type",
            f"Unknown doc schema {schema!r}. "
            f"Only ODC datasets are supported ({model.ODC_DATASET_SCHEMA_URL!r})",
        )
        return

    has_doc_errors = False
    for error in serialise.DATASET_SCHEMA.iter_errors(doc):
        has_doc_errors = True
        displayable_path = ".".join(error.absolute_path)

        hint = None
        if displayable_path == "crs" and "not of type" in error.message:
            hint = "epsg codes should be prefixed with 'epsg:1234'"

        context = f"({displayable_path}) " if displayable_path else ""
        yield _error("structure", f"{context}{error.message} ", hint=hint)

    if has_doc_errors:
        return

    dataset = serialise.from_doc(doc, skip_validation=True)

    if not dataset.product.href:
        yield _info("product_href", "A url (href) is recommended for products")

    yield from _validate_geo(dataset)

    # Note that a dataset may have no measurements (eg. telemetry data).
    # (TODO: a stricter mode for when we know we should have geo and measurement info)
    if dataset.measurements:
        for name, measurement in dataset.measurements.items():
            grid_name = measurement.grid
            if grid_name != "default" or dataset.grids:
                if grid_name not in dataset.grids:
                    yield _error(
                        "invalid_grid_ref",
                        f"Measurement {name!r} refers to unknown grid {grid_name!r}",
                    )

            if is_absolute(measurement.path):
                yield _warning(
                    "absolute_path",
                    f"measurement {name!r} has an absolute path: {measurement.path!r}",
                )

    yield from _validate_stac_properties(dataset)

    required_measurements: Dict[str, ExpectedMeasurement] = {}

    if product_definition is not None:
        required_measurements.update(
            {
                m.name: m
                for m in map(
                    ExpectedMeasurement.from_definition,
                    product_definition.get("measurements") or (),
                )
            }
        )

        product_name = product_definition.get("name")
        if product_name != dataset.product.name:
            # This is only informational, as it's possible products may be indexed
            # with finer-grained categories than the original datasets: eg. a
            # separate "nrt" product, or a test product.
            yield _info(
                "product_mismatch",
                f"Dataset product name {dataset.product.name!r} "
                f"does not match the given product ({product_name!r})",
            )

        for name in required_measurements:
            if name not in dataset.measurements.keys():
                yield _error(
                    "missing_measurement",
                    f"Product {product_name} expects a measurement {name!r}",
                )
        measurements_not_in_product = set(dataset.measurements.keys()).difference(
            {m["name"] for m in product_definition.get("measurements") or ()}
        )
        if (not expect_extra_measurements) and measurements_not_in_product:
            things = ", ".join(sorted(measurements_not_in_product))
            yield _warning(
                "extra_measurements",
                f"Dataset has measurements not present in product definition for {product_name!r}: {things}",
                hint="This may be valid, as it's allowed by ODC. Set `expect_extra_measurements` to mute this.",
            )

    # If we have a location:
    #     For each measurement, try to load it.
    #     If loadable:
    if thorough:
        for name, measurement in dataset.measurements.items():
            full_path = uri_resolve(readable_location, measurement.path)
            expected_measurement = required_measurements.get(name)

            band = measurement.band or 1
            with rasterio.open(full_path) as ds:
                ds: DatasetReader

                if band not in ds.indexes:
                    yield _error(
                        "incorrect_band",
                        f"Measurement {name!r} file contains no rio index {band!r}.",
                        hint=f"contains indexes {ds.indexes!r}",
                    )
                    continue

                if not expected_measurement:
                    # The measurement is not in the product definition.
                    #
                    # This is only informational, because a product doesn't have to
                    # define all the measurements that its datasets contain.
                    #
                    # This is historically because dataset documents reflect the
                    # measurements that are stored on disk, which can differ. But
                    # products define the set of measurements that are mandatory in
                    # every dataset.
                    #
                    # (Datasets differ when, for example, sensors go offline, or when
                    # there's on-disk measurements like panchromatic that GA doesn't
                    # want in their product definitions.)
                    if required_measurements:
                        yield _info(
                            "unspecified_measurement",
                            f"Measurement {name} is not in the product",
                        )
                else:
                    expected_dtype = expected_measurement.dtype
                    band_dtype = ds.dtypes[band - 1]
                    # TODO: NaN handling
                    if expected_dtype != band_dtype:
                        yield _error(
                            "different_dtype",
                            f"{name} dtype: "
                            f"product {expected_dtype!r} != dataset {band_dtype!r}",
                        )

                    # TODO: the nodata can also be a fill value, as mentioned by Kirill.
                    expected_nodata = expected_measurement.nodata
                    ds_nodata = ds.nodatavals[band - 1]
                    if expected_nodata != ds_nodata and not (
                        _is_nan(expected_nodata) and _is_nan(ds_nodata)
                    ):
                        yield _info(
                            "different_nodata",
                            f"{name} nodata: "
                            f"product {expected_nodata!r} != dataset {ds_nodata!r}",
                        )
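# A hedged sketch of consuming the generator above: messages are yielded lazily,
# so callers decide what is fatal. `Level` is the enum used by ValidationMessage;
# the helper name here is an assumption, not from the source.
def _example_is_valid(doc: Dict, product: Dict) -> bool:
    ok = True
    for msg in validate_dataset(doc, product_definition=product):
        print(f"{msg.level.name}: {msg.code}: {msg.reason}")
        if msg.level == Level.error:
            ok = False
    return ok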
def prepare_and_write(
    dataset_location: Path,
    output_yaml: Path,
    producer: str,
    granule_id: str = None,
    embed_location: bool = None,
) -> Tuple[DatasetDoc, Path]:
    if embed_location is None:
        # Default to embedding the location if they're not in the same folder.
        embed_location = output_yaml.parent not in dataset_location.parents
        _LOG.debug(
            "Auto-embed location?",
            auto_embed=bool(embed_location),
            data_location=dataset_location.parent,
            yaml_location=output_yaml.parent,
        )

    with DatasetPrepare(
        metadata_path=output_yaml,
        dataset_location=dataset_location,
    ) as p:
        p.properties["odc:producer"] = producer

        if producer == "esa.int":
            jp2_offsets = _extract_esa_fields(dataset_location, p, granule_id=granule_id)
        elif producer == "sinergise.com":
            jp2_offsets = _extract_sinergise_fields(dataset_location.parent, p)
        else:
            raise NotImplementedError(
                f"Unknown s2 producer {producer}. Expected 'sinergise.com' or 'esa.int'"
            )

        p.dataset_id = _get_stable_id(p)
        p.platform = _get_platform_name(p.properties)
        p.instrument = "MSI"
        p.constellation = "sentinel-2"

        # TODO: How to read collection number from metadata? (once ESA etc add one)
        collection_number = 0
        p.dataset_version = f"{collection_number}.0.{p.processed:%Y%m%d}"

        p.properties["odc:file_format"] = "JPEG2000"
        p.product_family = "level1"

        for path in jp2_offsets:
            band_number = _extract_band_number(path.stem)
            if band_number.lower() in ("tci", "pvi", "preview"):
                continue
            if band_number not in SENTINEL_MSI_BAND_ALIASES:
                raise RuntimeError(f"Unknown band number {band_number!r} in image {path}")

            p.note_measurement(
                path=path,
                name=SENTINEL_MSI_BAND_ALIASES[band_number],
                relative_to_dataset_location=True,
            )

        dataset_id, metadata_path = p.done(embed_location=embed_location)

    doc = serialise.from_doc(
        p.written_dataset_doc, skip_validation=True, normalise_properties=False
    )
    if not doc.locations:
        doc.locations = [names.resolve_location(dataset_location)]

    return doc, metadata_path
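# Hedged usage sketch (paths illustrative, not from the source): preparing an ESA
# level-1 zip and writing the metadata document next to it, so the default logic
# above leaves the location un-embedded.
#
#   dataset, metadata_path = prepare_and_write(
#       Path("S2B_MSIL1C_20201011T000249_N0209_R030_T55HFA_20201011T011446.zip"),
#       Path("S2B_MSIL1C_20201011T000249_N0209_R030_T55HFA_20201011T011446.odc-metadata.yaml"),
#       producer="esa.int",
#   )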
def validate_dataset(
    doc: Dict,
    product_definition: Optional[Dict] = None,
    metadata_type_definition: Optional[Dict] = None,
    thorough: bool = False,
    readable_location: Union[str, Path] = None,
    expect: ValidationExpectations = None,
) -> ValidationMessages:
    """
    Validate a dataset document, optionally against the given product.

    By default this will only look at the metadata. Run with thorough=True to
    open the data files too.

    :param product_definition: Optionally check that the dataset matches this product definition.
    :param thorough: Open the imagery too, to check that data types etc match.
    :param readable_location: Dataset location to use, if not the metadata path.
    :param expect: Where can we be lenient in validation?
    """
    validation_context = {}
    expect = expect or ValidationExpectations()
    if metadata_type_definition is not None:
        expect = expect.with_document_overrides(metadata_type_definition)
        validation_context["type"] = metadata_type_definition["name"]
    if product_definition is not None:
        expect = expect.with_document_overrides(product_definition)
        validation_context["product"] = product_definition["name"]

    # noinspection PyShadowingNames
    def _info(code: str, reason: str, hint: str = None):
        return ValidationMessage(
            Level.info, code, reason, hint=hint, context=validation_context
        )

    # noinspection PyShadowingNames
    def _warning(code: str, reason: str, hint: str = None):
        return ValidationMessage(
            Level.warning, code, reason, hint=hint, context=validation_context
        )

    # noinspection PyShadowingNames
    def _error(code: str, reason: str, hint: str = None):
        return ValidationMessage(
            Level.error, code, reason, hint=hint, context=validation_context
        )

    schema = doc.get("$schema")
    if schema is None:
        yield _error(
            "no_schema",
            f"No $schema field. "
            f"You probably want an ODC dataset schema {model.ODC_DATASET_SCHEMA_URL!r}",
        )
        return
    if schema != model.ODC_DATASET_SCHEMA_URL:
        yield _error(
            "unknown_doc_type",
            f"Unknown doc schema {schema!r}. "
            f"Only ODC datasets are supported ({model.ODC_DATASET_SCHEMA_URL!r})",
        )
        return

    has_doc_errors = False
    for error in serialise.DATASET_SCHEMA.iter_errors(doc):
        has_doc_errors = True
        displayable_path = ".".join(error.absolute_path)

        hint = None
        if displayable_path == "crs" and "not of type" in error.message:
            hint = "epsg codes should be prefixed with 'epsg:1234'"

        context = f"({displayable_path}) " if displayable_path else ""
        yield _error("structure", f"{context}{error.message} ", hint=hint)

    if has_doc_errors:
        return

    dataset = serialise.from_doc(doc, skip_validation=True)

    if not dataset.product.href:
        yield _info("product_href", "A url (href) is recommended for products")

    yield from _validate_geo(dataset, expect_geometry=expect.require_geometry)

    # Note that a dataset may have no measurements (eg. telemetry data).
    # (TODO: a stricter mode for when we know we should have geo and measurement info)
    if dataset.measurements:
        for name, measurement in dataset.measurements.items():
            grid_name = measurement.grid
            if grid_name != "default" or dataset.grids:
                if grid_name not in dataset.grids:
                    yield _error(
                        "invalid_grid_ref",
                        f"Measurement {name!r} refers to unknown grid {grid_name!r}",
                    )

            if is_absolute(measurement.path):
                yield _warning(
                    "absolute_path",
                    f"measurement {name!r} has an absolute path: {measurement.path!r}",
                )

    yield from _validate_stac_properties(dataset)

    required_measurements: Dict[str, ExpectedMeasurement] = {}

    if product_definition is not None:
        required_measurements.update(
            {
                m.name: m
                for m in map(
                    ExpectedMeasurement.from_definition,
                    product_definition.get("measurements") or (),
                )
            }
        )

        product_name = product_definition.get("name")
        if product_name != dataset.product.name:
            # This is only informational, as it's possible products may be indexed
            # with finer-grained categories than the original datasets: eg. a
            # separate "nrt" product, or a test product.
            yield _info(
                "product_mismatch",
                f"Dataset product name {dataset.product.name!r} "
                f"does not match the given product ({product_name!r})",
            )

        for name in required_measurements:
            if name not in dataset.measurements.keys():
                yield _error(
                    "missing_measurement",
                    f"Product {product_name} expects a measurement {name!r}",
                )
        measurements_not_in_product = set(dataset.measurements.keys()).difference(
            {m["name"] for m in product_definition.get("measurements") or ()}
        )
        # Remove the measurements that are allowed to be extra.
        measurements_not_in_product.difference_update(
            expect.allow_extra_measurements or set()
        )
        if measurements_not_in_product:
            things = ", ".join(sorted(measurements_not_in_product))
            yield _warning(
                "extra_measurements",
                f"Dataset has measurements not present in product definition for {product_name!r}: {things}",
                hint="This may be valid, as it's allowed by ODC. Add them to `allow_extra_measurements` to mute this.",
            )

    if metadata_type_definition:
        # Datacube does certain transforms on an eo3 doc before storage.
        # We need to do the same, as the fields will be read from the storage.
        prepared_doc = prep_eo3(doc)

        all_nullable_fields = tuple(expect.allow_nullable_fields) + tuple(
            expect.allow_missing_fields
        )
        for field_name, offsets in _get_field_offsets(
            metadata_type=metadata_type_definition
        ):
            if (
                # If a field is required...
                (field_name not in expect.allow_missing_fields)
                # ... and none of its offsets are in the document
                and not any(_has_offset(prepared_doc, offset) for offset in offsets)
            ):
                # ... warn them.
                product_name = (
                    product_definition.get("name")
                    if product_definition
                    else dataset.product.name
                )
                readable_offsets = " or ".join("->".join(offset) for offset in offsets)
                yield _warning(
                    "missing_field",
                    f"Dataset is missing field {field_name!r} "
                    f"for type {metadata_type_definition['name']!r}",
                    hint=f"Expected at {readable_offsets}",
                )
                continue

            if field_name not in all_nullable_fields:
                value = None
                for offset in offsets:
                    value = toolz.get_in(offset, prepared_doc)
                if value is None:
                    yield _info(
                        "null_field",
                        f"Value is null for configured field {field_name!r}",
                    )

    dataset_location = dataset.locations[0] if dataset.locations else readable_location

    # If we have a location:
    #     For each measurement, try to load it.
    #     If loadable:
    if thorough:
        for name, measurement in dataset.measurements.items():
            full_path = uri_resolve(dataset_location, measurement.path)
            expected_measurement = required_measurements.get(name)

            band = measurement.band or 1
            with rasterio.open(full_path) as ds:
                ds: DatasetReader

                if band not in ds.indexes:
                    yield _error(
                        "incorrect_band",
                        f"Measurement {name!r} file contains no rio index {band!r}.",
                        hint=f"contains indexes {ds.indexes!r}",
                    )
                    continue

                if not expected_measurement:
                    # The measurement is not in the product definition.
                    #
                    # This is only informational, because a product doesn't have to
                    # define all the measurements that its datasets contain.
                    #
                    # This is historically because dataset documents reflect the
                    # measurements that are stored on disk, which can differ. But
                    # products define the set of measurements that are mandatory in
                    # every dataset.
                    #
                    # (Datasets differ when, for example, sensors go offline, or when
                    # there's on-disk measurements like panchromatic that GA doesn't
                    # want in their product definitions.)
                    if required_measurements:
                        yield _info(
                            "unspecified_measurement",
                            f"Measurement {name} is not in the product",
                        )
                else:
                    expected_dtype = expected_measurement.dtype
                    band_dtype = ds.dtypes[band - 1]
                    # TODO: NaN handling
                    if expected_dtype != band_dtype:
                        yield _error(
                            "different_dtype",
                            f"{name} dtype: "
                            f"product {expected_dtype!r} != dataset {band_dtype!r}",
                        )

                    ds_nodata = ds.nodatavals[band - 1]

                    # If the dataset is missing 'nodata', we can allow anything in
                    # the product 'nodata'. (In ODC, nodata might be a fill value
                    # used when loading data.)
                    if ds_nodata is None:
                        continue

                    # Otherwise check that nodata matches.
                    expected_nodata = expected_measurement.nodata
                    if expected_nodata != ds_nodata and not (
                        _is_nan(expected_nodata) and _is_nan(ds_nodata)
                    ):
                        yield _error(
                            "different_nodata",
                            f"{name} nodata: "
                            f"product {expected_nodata!r} != dataset {ds_nodata!r}",
                        )
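# Hedged sketch of a lenient validation call. The attribute names come from the
# code above (require_geometry, allow_extra_measurements); whether they're
# accepted as constructor keywords by ValidationExpectations is an assumption.
def _example_lenient_validation(doc: Dict, product: Dict, md_type: Dict):
    expectations = ValidationExpectations(
        require_geometry=False,
        allow_extra_measurements=("panchromatic",),
    )
    return list(
        validate_dataset(
            doc,
            product_definition=product,
            metadata_type_definition=md_type,
            expect=expectations,
        )
    )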
def update_metadata(
    nci_metadata_file, s3_bucket, s3_base_url, explorer_base_url, sns_topic, s3_path
):
    """
    Upload updated metadata (with the nbar measurements removed), an updated
    checksum file and a newly created STAC document, then publish an SNS message.

    :param nci_metadata_file: Path of metadata file in NCI
    :param s3_bucket: Name of S3 bucket
    :param s3_base_url: Base URL of the S3 bucket
    :param explorer_base_url: Base URL of the explorer
    :param sns_topic: ARN of the SNS topic
    :param s3_path: Path in S3
    :return: List of errors
    """
    # Initialise error list
    metadata_error_list = []
    # Initialise checksum list
    new_checksum_list = {}

    nci_metadata_file_path = Path(nci_metadata_file)
    temp_metadata = serialise.load_yaml(nci_metadata_file_path)

    # Delete nbar-related metadata.
    # Because Landsat 8 is different, we need to check that each field exists
    # before removing it.
    for measurement in (
        "nbar_blue",
        "nbar_green",
        "nbar_nir",
        "nbar_red",
        "nbar_swir_1",
        "nbar_swir_2",
        "nbar_coastal_aerosol",
        "nbar_panchromatic",
        "oa_nbar_contiguity",
    ):
        temp_metadata["measurements"].pop(measurement, None)
    temp_metadata["accessories"].pop("thumbnail:nbar", None)

    # Format an eo3 dataset dict for human-readable yaml serialisation.
    temp_metadata = serialise.prepare_formatting(temp_metadata)

    # Dump metadata yaml into buffer
    with io.BytesIO() as temp_yaml:
        serialise.dumps_yaml(temp_yaml, temp_metadata)
        # Seek back to the beginning of the file before the next read/write
        temp_yaml.seek(0)
        new_checksum_list[nci_metadata_file_path.name] = verify.calculate_hash(temp_yaml)

        # Write odc metadata yaml object into S3
        s3_metadata_file = f"{s3_path}/{nci_metadata_file_path.name}"
        try:
            upload_s3_resource(s3_bucket, s3_metadata_file, temp_yaml.getvalue())
            LOG.info(f"Finished uploading metadata to {s3_metadata_file}")
        except S3SyncException as exp:
            LOG.error(f"Failed uploading metadata to {s3_metadata_file} - {exp}")
            metadata_error_list.append(
                f"Failed uploading metadata to {s3_metadata_file} - {exp}"
            )

    # Create stac metadata
    name = nci_metadata_file_path.stem.replace(".odc-metadata", "")
    stac_output_file_path = nci_metadata_file_path.with_name(f"{name}.stac-item.json")
    stac_url_path = (
        f"{s3_base_url if s3_base_url else boto3.client('s3').meta.endpoint_url}"
        f"/{s3_path}/"
    )
    item_doc = dc_to_stac(
        serialise.from_doc(temp_metadata),
        nci_metadata_file_path,
        stac_output_file_path,
        stac_url_path,
        explorer_base_url,
        True,
    )
    stac_dump = json.dumps(item_doc, indent=4, default=json_fallback)

    # Write stac json to buffer
    with io.BytesIO() as temp_stac:
        temp_stac.write(stac_dump.encode())
        # Seek back to the beginning of the file before the next read/write
        temp_stac.seek(0)
        new_checksum_list[stac_output_file_path.name] = verify.calculate_hash(temp_stac)

        # Write stac metadata json object into S3
        s3_stac_file = f"{s3_path}/{stac_output_file_path.name}"
        try:
            upload_s3_resource(s3_bucket, s3_stac_file, temp_stac.getvalue())
            LOG.info(f"Finished uploading STAC metadata to {s3_stac_file}")
        except S3SyncException as exp:
            LOG.error(f"Failed uploading STAC metadata to {s3_stac_file} - {exp}")
            metadata_error_list.append(
                f"Failed uploading STAC metadata to {s3_stac_file} - {exp}"
            )

    # Publish message containing STAC metadata to the SNS topic
    message_attributes = get_common_message_attributes(json.loads(stac_dump))
    message_attributes.update(
        {"action": {"DataType": "String", "StringValue": "ADDED"}}
    )
    try:
        publish_sns(sns_topic, stac_dump, message_attributes)
        LOG.info(f"Finished publishing SNS Message to SNS Topic {sns_topic}")
    except S3SyncException as exp:
        LOG.error(f"Failed publishing SNS Message to SNS Topic {sns_topic} - {exp}")
        metadata_error_list.append(
            f"Failed publishing SNS Message to SNS Topic {sns_topic} - {exp}"
        )

    # Update checksum file
    checksum_file_path = nci_metadata_file_path.with_name(f"{name}.sha1")
    try:
        upload_checksum(
            nci_metadata_file_path,
            checksum_file_path,
            new_checksum_list,
            s3_bucket,
            s3_path,
        )
        LOG.info(
            f"Finished uploading checksum file {s3_path}/{checksum_file_path.name}"
        )
    except S3SyncException as exp:
        LOG.error(
            f"Failed uploading checksum file {s3_path}/{checksum_file_path.name} - {exp}"
        )
        metadata_error_list.append(
            f"Failed uploading checksum file {s3_path}/{checksum_file_path.name} - {exp}"
        )

    return metadata_error_list