def test_complain_about_missing_fields(tmp_path: Path, l1_ls8_folder: Path):
    """
    It should complain immediately if I add a file without enough metadata to
    write the filename.

    (and with a friendly error message)
    """
    out = tmp_path / "out"
    out.mkdir()

    [blue_geotiff_path] = l1_ls8_folder.rglob("L*_B2.TIF")

    # The default simple naming conventions need at least a date and family...
    with pytest.raises(
        ValueError, match="Need more properties to fulfill naming conventions."
    ):
        with DatasetAssembler(out) as p:
            p.write_measurement("blue", blue_geotiff_path)

    # It should mention the field that's missing (we added a date, so product_family is needed).
    with DatasetAssembler(out) as p:
        with pytest.raises(ValueError, match="odc:product_family"):
            p.datetime = datetime(2019, 7, 4, 13, 7, 5)
            p.write_measurement("blue", blue_geotiff_path)

    # DEA naming conventions have stricter standards, and will tell you which fields you need to add.
    with DatasetAssembler(out, naming_conventions="dea") as p:
        # Set all the fields that suffice for the default naming conventions.
        p.datetime = datetime(2019, 7, 4, 13, 7, 5)
        p.product_family = "quaternarius"
        p.processed_now()

        # These fields are mandatory for DEA, so they should be complained about.
        expected_extra_fields_needed = (
            "eo:platform",
            "eo:instrument",
            "odc:dataset_version",
            "odc:producer",
            "odc:region_code",
        )
        with pytest.raises(ValueError) as got_error:
            p.write_measurement("blue", blue_geotiff_path)

        # All needed fields should be listed in the error message.
        for needed_field_name in expected_extra_fields_needed:
            assert needed_field_name in got_error.value.args[0], (
                f"Expected field {needed_field_name} to "
                f"be listed as mandatory in the error message"
            )
def assert_names_match(
    tmp_path: Path,
    # Given:
    conventions,
    properties: Mapping,
    # Then expect:
    expect_metadata_path: str = None,
    expect_label: str = None,
):
    """
    Easily test a set of naming conventions: do certain properties lead to the
    expected file names?
    """
    __tracebackhide__ = operator.methodcaller("errisinstance", AssertionError)

    with DatasetAssembler(tmp_path, naming_conventions=conventions) as p:
        p.properties.update(properties)
        dataset_id, metadata_path = p.done()

    if expect_metadata_path:
        metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
        assert metadata_path_offset == expect_metadata_path

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)

    if expect_label:
        assert doc["label"] == expect_label, "Unexpected dataset label"
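
# A minimal usage sketch of the helper above (a hypothetical test, not part of
# the original suite). The expected label and path mirror what
# test_minimal_generated_naming_package (below) sees for the same properties.
def test_assert_names_match_example(tmp_path: Path):
    assert_names_match(
        tmp_path,
        conventions="default",
        properties={
            "datetime": datetime(2019, 7, 4, 13, 7, 5),
            "odc:product_family": "quaternarius",
            "odc:processing_datetime": datetime(2019, 7, 4, 13, 8, 7),
        },
        expect_metadata_path="quaternarius/2019/07/04/quaternarius_2019-07-04.odc-metadata.yaml",
        expect_label="quaternarius_2019-07-04",
    )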
def test_dea_interim_folder_calculation(tmp_path: Path):
    """
    DEA naming conventions should include maturity in the folder name
    when it's not a 'final' dataset.
    """
    with DatasetAssembler(tmp_path, naming_conventions="dea") as p:
        p.platform = "landsat-7"
        # Should not end up in the path, as it's the default:
        p.product_maturity = "stable"
        p.instrument = "ETM+"
        p.datetime = datetime(1998, 7, 30)
        p.product_family = "frogs"
        p.processed = "1999-11-20 00:00:53.152462Z"
        p.maturity = "interim"
        p.producer = "ga.gov.au"
        p.properties["landsat:landsat_scene_id"] = "LE70930821999324EDC00"
        p.dataset_version = "1.2.3"
        p.region_code = "093082"

        p.done()

    [metadata_path] = tmp_path.rglob("*.odc-metadata.yaml")
    calculated_path: Path = metadata_path.relative_to(tmp_path)
    assert calculated_path == Path(
        #                            ⇩⇩⇩⇩⇩⇩⇩⇩ Adds the interim flag
        "ga_ls7e_frogs_1/093/082/1998/07/30_interim/ga_ls7e_frogs_1-2-3_093082_1998-07-30_interim.odc-metadata.yaml"
    )
def test_dea_c3_naming_conventions(tmp_path: Path):
    """
    A sample scene for Alchemist C3 processing that tests the naming conventions.
    """
    p = DatasetAssembler(tmp_path, naming_conventions="dea_c3")
    p.platform = "landsat-7"
    p.datetime = datetime(1998, 7, 30)
    p.product_family = "wo"
    p.processed = "1998-07-30T12:23:23"
    p.maturity = "interim"
    p.producer = "ga.gov.au"
    p.region_code = "090081"

    # With a few fields still missing, expect a ValueError.
    with pytest.raises(
        ValueError, match="Need more properties to fulfill naming conventions."
    ):
        p.done()

    # Put back the missing ones.
    p.dataset_version = "1.6.0"
    p.collection_number = "3"

    # The collection number is returned as an integer via the getter.
    assert p.collection_number == 3

    # Success case
    dataset_id, metadata_path = p.done()

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (
        metadata_path_offset
        == "ga_ls_wo_3/1-6-0/090/081/1998/07/30/ga_ls_wo_3_090081_1998-07-30_interim.odc-metadata.yaml"
    )
def test_add_source_dataset(tmp_path: Path, inherit_geom):
    from eodatasets3 import serialise

    p = DatasetAssembler(tmp_path, naming_conventions="dea_c3")
    source_dataset = serialise.from_path(
        Path(__file__).parent / "data/LC08_L1TP_089080_20160302_20170328_01_T1.yaml"
    )
    p.add_source_dataset(
        source_dataset, auto_inherit_properties=True, inherit_geometry=inherit_geom
    )

    p.maturity = "interim"
    p.collection_number = "3"
    p.dataset_version = "1.6.0"
    p.producer = "ga.gov.au"
    p.processed = "1998-07-30T12:23:23"
    p.product_family = "wofs"
    p.write_measurement(
        "water",
        Path(__file__).parent
        / "data/wofs/ga_ls_wofs_3_099081_2020-07-26_interim_water_clipped.tif",
    )

    id, path = p.done()

    output = serialise.from_path(path)
    if inherit_geom:
        # POLYGON((609615 -3077085, 378285 -3077085, 378285 -3310515, 609615 -3310515, 609615 -3077085))
        assert output.geometry == source_dataset.geometry
    else:
        # POLYGON((684285 -3439275, 684285 -3444495, 689925 -3444495, 689925 -3439275, 684285 -3439275))
        # The geometry is not set from the source dataset, but from the added wofs measurement.
        assert output.geometry != source_dataset.geometry
def test_africa_naming_conventions(tmp_path: Path):
    """
    Minimal fields needed for DEAfrica naming conventions.
    """
    with DatasetAssembler(tmp_path, naming_conventions="deafrica") as p:
        # Just the fields listed in required_fields.
        p.producer = "digitalearthafrica.org"
        p.datetime = datetime(1998, 7, 30)
        p.region_code = "090081"
        p.product_family = "wofs"
        p.platform = "LANDSAT_8"
        p.processed_now()
        p.dataset_version = "0.1.2"

        dataset_id, metadata_path = p.done()

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (
        metadata_path_offset
        == "wofs_ls/0-1-2/090/081/1998/07/30/wofs_ls_090081_1998-07-30.odc-metadata.yaml"
    )

    with DatasetAssembler(tmp_path, naming_conventions="deafrica") as p:
        # Just the fields listed in required_fields.
        p.producer = "digitalearthafrica.org"
        p.datetime = datetime(1998, 7, 30)
        p.region_code = "090081"
        p.product_family = "fc"
        p.platform = "LANDSAT_8"
        p.processed_now()
        p.dataset_version = "0.1.2"

        dataset_id, metadata_path = p.done()

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (
        metadata_path_offset
        == "fc_ls/0-1-2/090/081/1998/07/30/fc_ls_090081_1998-07-30.odc-metadata.yaml"
    )
def test_dataset_multi_platform(tmp_path: Path):
    """Can we make a dataset derived from multiple platforms?"""
    # No platform is included in names when there's a mix.
    with DatasetAssembler(tmp_path) as p:
        p.platforms = ["Sentinel_2a", "landsat_7"]
        assert p.platform == "landsat-7,sentinel-2a"

        p.datetime = datetime(2019, 1, 1)
        p.product_family = "peanuts"
        p.processed_now()

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)
    assert doc["label"] == "peanuts_2019-01-01"

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (
        metadata_path_offset
        == "peanuts/2019/01/01/peanuts_2019-01-01.odc-metadata.yaml"
    )

    # ... but show the platform abbreviation when there's a known group.
    with DatasetAssembler(tmp_path) as p:
        p.platforms = ["Sentinel_2a", "sentinel_2b"]
        assert p.platform == "sentinel-2a,sentinel-2b"

        p.datetime = datetime(2019, 1, 1)
        p.product_family = "peanuts"
        p.processed_now()

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)
    assert doc["label"] == "s2_peanuts_2019-01-01"

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (
        metadata_path_offset
        == "s2_peanuts/2019/01/01/s2_peanuts_2019-01-01.odc-metadata.yaml"
    )
def test_minimal_generated_naming_package(tmp_path: Path, l1_ls8_folder: Path):
    """
    What's the minimum number of fields we can set and still generate
    file/product names to produce a package?
    """
    out = tmp_path / "out"
    out.mkdir()

    [blue_geotiff_path] = l1_ls8_folder.rglob("L*_B2.TIF")

    with DatasetAssembler(out) as p:
        p.datetime = datetime(2019, 7, 4, 13, 7, 5)
        p.product_family = "quaternarius"
        p.processed_now()

        p.write_measurement("blue", blue_geotiff_path)

        # A friendly __str__ for notebook/terminal users:
        assert str(p) == dedent(
            f"""
            Assembling quaternarius (unfinished)
            - 1 measurements: blue
            - 4 properties: datetime, odc:file_format, odc:processing_datetime, odc:prod...
            Writing to location: {out}/quaternarius/2019/07/04/quaternarius_2019-07-04.odc-metadata.yaml
            """
        )

        # p.done() will validate the dataset and write it to the destination atomically.
        dataset_id, metadata_path = p.done()

    assert dataset_id is not None

    assert_file_structure(
        out,
        {
            "quaternarius": {
                "2019": {
                    "07": {
                        "04": {
                            # Set a dataset version to get rid of 'beta' label.
                            "quaternarius_2019-07-04.odc-metadata.yaml": "",
                            "quaternarius_2019-07-04.proc-info.yaml": "",
                            "quaternarius_2019-07-04_blue.tif": "",
                            "quaternarius_2019-07-04.sha1": "",
                        }
                    }
                }
            }
        },
    )
def test_dataset_no_measurements(tmp_path: Path):
    """Can we make a dataset with no measurements? (eg. telemetry data)"""
    with DatasetAssembler(tmp_path) as p:
        # A custom label too.
        p.label = "chipmonk_sightings_2019"
        p.datetime = datetime(2019, 1, 1)
        p.product_family = "chipmonk_sightings"
        p.processed_now()

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)

    assert doc["label"] == "chipmonk_sightings_2019", "Couldn't override label field"
def test_minimal_s2_dataset(tmp_path: Path):
    """A minimal dataset with sentinel platform/instrument"""
    with DatasetAssembler(tmp_path) as p:
        p.platform = "sentinel-2a"
        p.instrument = "msi"
        p.datetime = datetime(2018, 11, 4)
        p.product_family = "blueberries"
        p.processed = "2018-11-05T12:23:23"

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)

    assert doc["label"] == "s2am_blueberries_2018-11-04", "Unexpected dataset label"
def test_custom_naming(tmp_path: Path):
    """
    We can create naming conventions separately, and later give them to the assembler.
    """
    p = _basic_properties_set()
    convention = namer(properties=p)
    convention.dataset_folder = Path("my/custom/folder/")

    with DatasetAssembler(tmp_path, names=convention) as a:
        dataset_id, metadata_path = a.done()

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert (
        metadata_path_offset
        == "my/custom/folder/ga_s2am_tester_1-2-3_023543_2013-02-03.odc-metadata.yaml"
    )
def test_minimal_s1_dataset(tmp_path: Path):
    """A minimal dataset with sentinel-1a/b platform/instrument"""
    with DatasetAssembler(tmp_path) as p:
        p.platform = "sentinel-1a"
        p.instrument = "c-sar"
        p.datetime = datetime(2018, 11, 4)
        p.product_family = "bck"
        p.processed = "2018-11-05T12:23:23"

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.safe_load(f)

    assert doc["label"] == "s1ac_bck_2018-11-04", "Unexpected dataset label"
def prepare_and_write(
    ds_path: Path,
    output_yaml: Path,
    producer: str,
) -> Tuple[uuid.UUID, Path]:
    with DatasetAssembler(
        metadata_path=output_yaml,
        dataset_location=ds_path,
    ) as p:
        p.properties["odc:producer"] = producer

        if producer == "esa.int":
            jp2_offsets = _extract_esa_fields(ds_path, p)
        elif producer == "sinergise.com":
            jp2_offsets = _extract_sinergise_fields(ds_path, p)
        else:
            raise NotImplementedError(
                f"Unknown s2 producer {producer}. Expected 'sinergise.com' or 'esa.int'"
            )

        p.dataset_id = _get_stable_id(p)

        p.properties["eo:platform"] = _get_platform_name(p.properties)
        p.properties["eo:instrument"] = "MSI"
        p.properties["odc:dataset_version"] = f"1.0.{p.processed:%Y%m%d}"
        p.properties["odc:file_format"] = "JPEG2000"
        p.properties["odc:product_family"] = "level1"

        for path in jp2_offsets:
            band_number = _extract_band_number(path.stem)
            if band_number.lower() in ("tci", "pvi", "preview"):
                continue
            if band_number not in SENTINEL_MSI_BAND_ALIASES:
                raise RuntimeError(
                    f"Unknown band number {band_number!r} in image {path}"
                )

            p.note_measurement(
                path=path,
                name=SENTINEL_MSI_BAND_ALIASES[band_number],
                relative_to_dataset_location=True,
            )

        return p.done()
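
# _extract_band_number isn't shown here; this is a minimal sketch of the
# behaviour the loop above assumes (the real helper may differ): take the last
# underscore-separated token of the file stem and strip the 'B' prefix,
# mirroring the inline parsing in the zip-based preparer further below.
def _extract_band_number_sketch(stem: str) -> str:
    # eg. "T55HFA_20201011T000249_B01" -> "01", "..._TCI" -> "TCI"
    return stem.rsplit("_", 1)[-1].replace("B", "")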
def test_dataset_given_properties(tmp_path: Path):
    """Can we give existing properties to the assembler?"""
    properties = {
        "datetime": datetime(2019, 1, 1),
        "odc:product_family": "chipmonk_sightings",
        "odc:processing_datetime": "2021-06-15T01:33:43.378850",
    }
    names = namer(properties=properties)
    with DatasetAssembler(tmp_path, names=names) as p:
        # It should have normalised the properties!
        assert p.processed == datetime(2021, 6, 15, 1, 33, 43, 378850, timezone.utc)

        dataset_id, metadata_path = p.done()

    relative_path = metadata_path.relative_to(tmp_path)
    assert relative_path == Path(
        "chipmonk_sightings/2019/01/01/chipmonk_sightings_2019-01-01.odc-metadata.yaml"
    )
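
# A small illustrative sketch (hypothetical test, not from the original suite):
# known properties are also normalised on direct assignment, as
# test_dea_style_package below notes ("LANDSAT_8" becomes 'landsat-8').
def test_platform_normalisation_sketch(tmp_path: Path):
    with DatasetAssembler(tmp_path) as p:
        p.platform = "LANDSAT_8"
        assert p.platform == "landsat-8"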
def test_minimal_s2_dataset_normal(tmp_path: Path):
    """A minimal dataset with sentinel platform/instrument"""
    with DatasetAssembler(tmp_path) as p:
        p.platform = "sentinel-2a"
        p.instrument = "msi"
        p.datetime = datetime(2018, 11, 4)
        p.product_family = "blueberries"
        p.processed = "2018-11-05T12:23:23"
        p.properties["sentinel:sentinel_tile_id"] = (
            "S2A_OPER_MSI_L1C_TL_SGS__20170822T015626_A011310_T54KYU_N02.05"
        )

        dataset_id, metadata_path = p.done()

    with metadata_path.open("r") as f:
        doc = yaml.YAML(typ="safe").load(f)

    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert metadata_path_offset == (
        "s2am_blueberries/2018/11/04/s2am_blueberries_2018-11-04.odc-metadata.yaml"
    )
    assert doc["label"] == "s2am_blueberries_2018-11-04", "Unexpected dataset label"
def test_minimal_package_with_product_name(tmp_path: Path, l1_ls8_folder: Path):
    """
    You can specify an ODC product name manually to avoid most of the name generation.
    """
    out = tmp_path / "out"
    out.mkdir()

    [blue_geotiff_path] = l1_ls8_folder.rglob("L*_B2.TIF")

    with DatasetAssembler(out) as p:
        p.datetime = datetime(2019, 7, 4, 13, 7, 5)
        p.product_name = "loch_ness_sightings"
        p.processed = datetime(2019, 7, 4, 13, 8, 7)

        p.write_measurement("blue", blue_geotiff_path)

        dataset_id, metadata_path = p.done()

    assert dataset_id is not None
    assert_file_structure(
        out,
        {
            "loch_ness_sightings": {
                "2019": {
                    "07": {
                        "04": {
                            # Set a dataset version to get rid of 'beta' label.
                            "loch_ness_sightings_2019-07-04.odc-metadata.yaml": "",
                            "loch_ness_sightings_2019-07-04.proc-info.yaml": "",
                            "loch_ness_sightings_2019-07-04_blue.tif": "",
                            "loch_ness_sightings_2019-07-04.sha1": "",
                        }
                    }
                }
            }
        },
    )
def prepare_and_write(
    ds_path: Path,
    output_yaml_path: Path,
    source_telemetry: Path = None,
    # TODO: Can we infer the producer automatically? This is bound to cause mistakes otherwise.
    producer="usgs.gov",
) -> Tuple[uuid.UUID, Path]:
    """
    Prepare an eo3 metadata file for a Level1 dataset.

    The input dataset path can be a folder or a tar file.
    """
    mtl_doc, mtl_filename = get_mtl_content(ds_path)
    if not mtl_doc:
        raise ValueError(f"No MTL file found for {ds_path}")

    usgs_collection_number = mtl_doc["metadata_file_info"].get("collection_number")
    if usgs_collection_number is None:
        raise NotImplementedError(
            "Dataset has no collection number: pre-collection data is not supported."
        )

    data_format = mtl_doc["product_metadata"]["output_format"]
    if data_format.upper() != "GEOTIFF":
        raise NotImplementedError(f"Only GTiff currently supported, got {data_format}")
    file_format = FileFormat.GeoTIFF  # Assumed below.

    projection_params = mtl_doc["projection_parameters"]
    if (
        "grid_cell_size_thermal" in projection_params
        and "grid_cell_size_reflective" in projection_params
        and (
            projection_params["grid_cell_size_reflective"]
            != projection_params["grid_cell_size_thermal"]
        )
    ):
        raise NotImplementedError("reflective and thermal have different cell sizes")
    ground_sample_distance = min(
        value
        for name, value in projection_params.items()
        if name.startswith("grid_cell_size_")
    )

    with DatasetAssembler(
        metadata_path=output_yaml_path,
        dataset_location=ds_path,
        # Deterministic ID based on USGS's product id (which changes when the scene is reprocessed by them)
        dataset_id=uuid.uuid5(
            USGS_UUID_NAMESPACE, mtl_doc["metadata_file_info"]["landsat_product_id"]
        ),
        naming_conventions="dea",
        if_exists=IfExists.Overwrite,
    ) as p:
        if source_telemetry:
            # Only GA's data has source telemetry...
            assert producer == "ga.gov.au"
            p.add_source_path(source_telemetry)

        p.platform = mtl_doc["product_metadata"]["spacecraft_id"]
        p.instrument = mtl_doc["product_metadata"]["sensor_id"]
        p.product_family = "level1"
        p.producer = producer
        p.datetime = "{}T{}".format(
            mtl_doc["product_metadata"]["date_acquired"],
            mtl_doc["product_metadata"]["scene_center_time"],
        )
        p.processed = mtl_doc["metadata_file_info"]["file_date"]
        p.properties["odc:file_format"] = file_format
        p.properties["eo:gsd"] = ground_sample_distance

        cloud_cover = mtl_doc["image_attributes"]["cloud_cover"]
        # Cloud cover is -1 when missing (such as TIRS-only data)
        if cloud_cover != -1:
            p.properties["eo:cloud_cover"] = cloud_cover

        p.properties["eo:sun_azimuth"] = mtl_doc["image_attributes"]["sun_azimuth"]
        p.properties["eo:sun_elevation"] = mtl_doc["image_attributes"]["sun_elevation"]
        p.properties["landsat:collection_number"] = usgs_collection_number

        for section, fields in _COPYABLE_MTL_FIELDS:
            for field in fields:
                value = mtl_doc[section].get(field)
                if value is not None:
                    p.properties[f"landsat:{field}"] = value

        p.region_code = f"{p.properties['landsat:wrs_path']:03d}{p.properties['landsat:wrs_row']:03d}"
        org_collection_number = utils.get_collection_number(
            p.producer, p.properties["landsat:collection_number"]
        )
        p.dataset_version = f"{org_collection_number}.0.{p.processed:%Y%m%d}"

        # Is this an NRT product?
        # The collection category is one of: T1, T2 or RT ('real time')
        if p.properties["landsat:collection_category"] == "RT":
            p.properties["odc:dataset_maturity"] = "nrt"

        band_aliases = get_band_alias_mappings(p.platform, p.instrument)
        for usgs_band_id, file_location in _iter_bands_paths(mtl_doc):
            p.note_measurement(
                band_aliases[usgs_band_id],
                file_location,
                relative_to_dataset_location=True,
            )

        p.add_accessory_file("metadata:landsat_mtl", Path(mtl_filename))

        return p.done()
def prepare_and_write(
    ds_path: Path,
    collection_location: Path,
    # TODO: Can we infer the producer automatically? This is bound to cause mistakes otherwise.
    producer="usgs.gov",
) -> Tuple[uuid.UUID, Path]:
    """
    Prepare an eo3 metadata file for a Level2 dataset.

    The input dataset path can be a folder or a tar file.
    """
    mtl_doc, mtl_filename = get_mtl_content(
        ds_path, root_element="landsat_metadata_file"
    )
    if not mtl_doc:
        raise ValueError(f"No MTL file found for {ds_path}")

    usgs_collection_number = mtl_doc["product_contents"].get("collection_number")
    if usgs_collection_number is None:
        raise NotImplementedError(
            "Dataset has no collection number: pre-collection data is not supported."
        )

    data_format = mtl_doc["product_contents"]["output_format"]
    if data_format.upper() != "GEOTIFF":
        raise NotImplementedError(f"Only GTiff currently supported, got {data_format}")
    file_format = FileFormat.GeoTIFF  # Assumed below.

    if (
        mtl_doc["projection_attributes"]["grid_cell_size_reflective"]
        != mtl_doc["projection_attributes"]["grid_cell_size_thermal"]
    ):
        raise NotImplementedError("reflective and thermal have different cell sizes")
    ground_sample_distance = min(
        value
        for name, value in mtl_doc["projection_attributes"].items()
        if name.startswith("grid_cell_size_")
    )

    with DatasetAssembler(
        collection_location=collection_location,
        # Deterministic ID based on USGS's product id (which changes when the scene is reprocessed by them)
        dataset_id=uuid.uuid5(
            USGS_UUID_NAMESPACE, mtl_doc["product_contents"]["landsat_product_id"]
        ),
        naming_conventions="dea",
        if_exists=IfExists.Overwrite,
    ) as p:
        p.platform = mtl_doc["image_attributes"]["spacecraft_id"]
        p.instrument = mtl_doc["image_attributes"]["sensor_id"]
        p.product_family = "level2"
        p.producer = producer
        p.datetime = "{}T{}".format(
            mtl_doc["image_attributes"]["date_acquired"],
            mtl_doc["image_attributes"]["scene_center_time"],
        )
        # (Level 2 stores this in a different place to Level 1's "metadata_file_info/file_date".)
        p.processed = mtl_doc["level2_processing_record"]["date_product_generated"]
        p.properties["odc:file_format"] = file_format
        p.properties["eo:gsd"] = ground_sample_distance
        p.properties["eo:cloud_cover"] = mtl_doc["image_attributes"]["cloud_cover"]
        p.properties["eo:sun_azimuth"] = mtl_doc["image_attributes"]["sun_azimuth"]
        p.properties["eo:sun_elevation"] = mtl_doc["image_attributes"]["sun_elevation"]
        p.properties["landsat:collection_number"] = usgs_collection_number

        for section, fields in _COPYABLE_MTL_FIELDS:
            for field in fields:
                value = mtl_doc[section].get(field)
                if value is not None:
                    p.properties[f"landsat:{field}"] = value

        p.region_code = f"{p.properties['landsat:wrs_path']:03d}{p.properties['landsat:wrs_row']:03d}"
        org_collection_number = utils.get_collection_number(
            p.producer, p.properties["landsat:collection_number"]
        )
        p.dataset_version = f"{org_collection_number}.0.{p.processed:%Y%m%d}"

        band_aliases = get_band_alias_mappings(p.platform, p.instrument)
        for usgs_band_id, file_location in _iter_bands_paths(mtl_doc):
            # Copy each band into the output package.
            # (note_measurement() could instead reference it in place.)
            p.write_measurement(
                band_aliases[usgs_band_id], os.path.join(ds_path, file_location)
            )

        p.add_accessory_file("metadata:landsat_mtl", Path(mtl_filename))

        return p.done()
def package(
    out_directory: Path,
    granule: Granule,
    included_products: Iterable[str] = DEFAULT_PRODUCTS,
    include_oa: bool = True,
    oa_resolution: Optional[Tuple[float, float]] = None,
) -> Tuple[UUID, Path]:
    """
    Package an L2 product.

    :param include_oa:
    :param out_directory:
        The base directory for output datasets. A DEA-naming-conventions
        folder hierarchy will be created inside this folder.
    :param granule:
        Granule information. You probably want to make one with Granule.from_path()
    :param included_products:
        A list of imagery products to include in the package.
        Defaults to all products.
    :return:
        The dataset UUID and output metadata path
    """
    included_products = tuple(s.lower() for s in included_products)

    with h5py.File(granule.wagl_hdf5, "r") as fid:
        granule_group = fid[granule.name]
        wagl_doc = _read_wagl_metadata(granule_group)

        with DatasetAssembler(
            out_directory.absolute(),
            # WAGL stamps a good, random ID already.
            dataset_id=granule.wagl_metadata.get("id"),
            naming_conventions="dea_s2"
            if ("sentinel" in wagl_doc["source_datasets"]["platform_id"].lower())
            else "dea",
        ) as p:
            _apply_wagl_metadata(p, wagl_doc)

            # It's a GA ARD product.
            p.producer = "ga.gov.au"
            p.product_family = "ard"
            p.maturity = _determine_maturity(
                acq_date=p.datetime,
                processed=p.processed,
                wagl_doc=wagl_doc,
            )

            if granule.source_level1_metadata is not None:
                # For historical consistency: we want to use the instrument that the source L1 product
                # came from, not the instruments reported from the WAGL doc.
                #
                # Eg. Level 1 will say "OLI_TIRS", while the wagl doc will say "OLI".
                # Our current C3 products say "OLI_TIRS", so we need to stay consistent.
                # (even though WAGL only *used* the OLI bands, it came from an OLI_TIRS observation)
                #
                # So delete our current wagl one, since we're adding a source dataset:
                if p.instrument is not None:
                    del p.properties["eo:instrument"]

                p.add_source_dataset(
                    granule.source_level1_metadata, auto_inherit_properties=True
                )
                # When level 1 is NRT, ARD is always NRT.
                if granule.source_level1_metadata.maturity == "nrt":
                    p.maturity = "nrt"

            org_collection_number = utils.get_collection_number(
                p.platform, p.producer, p.properties.get("landsat:collection_number")
            )
            p.dataset_version = f"{org_collection_number}.2.1"
            p.region_code = _extract_reference_code(p, granule.name)

            _read_gqa_doc(p, granule.gqa_doc)
            _read_fmask_doc(p, granule.fmask_doc)
            if granule.tesp_doc:
                _take_software_versions(p, granule.tesp_doc)

            _unpack_products(p, included_products, granule_group)

            if include_oa:
                with do("Starting OA", heading=True):
                    _unpack_observation_attributes(
                        p,
                        included_products,
                        granule_group,
                        infer_datetime_range=p.platform.startswith("landsat"),
                        oa_resolution=oa_resolution,
                    )

                if granule.fmask_image:
                    with do(f"Writing fmask from {granule.fmask_image} "):
                        p.write_measurement(
                            "oa:fmask",
                            granule.fmask_image,
                            expand_valid_data=False,
                            overview_resampling=Resampling.mode,
                        )

            with do("Finishing package"):
                return p.done()
def package(
    out_directory: Path,
    granule: Granule,
    included_products: Iterable[str] = DEFAULT_PRODUCTS,
    include_oa: bool = True,
) -> Tuple[UUID, Path]:
    """
    Package an L2 product.

    :param include_oa:
    :param out_directory:
        The base directory for output datasets. A DEA-naming-conventions
        folder hierarchy will be created inside this folder.
    :param granule:
        Granule information. You probably want to make one with Granule.from_path()
    :param included_products:
        A list of imagery products to include in the package.
        Defaults to all products.
    :return:
        The dataset UUID and output metadata path
    """
    included_products = tuple(s.lower() for s in included_products)

    with h5py.File(granule.wagl_hdf5, "r") as fid:
        granule_group = fid[granule.name]

        with DatasetAssembler(
            out_directory,
            # WAGL stamps a good, random ID already.
            dataset_id=granule.wagl_metadata.get("id"),
            naming_conventions="dea",
        ) as p:
            level1 = granule.source_level1_metadata
            p.add_source_dataset(level1, auto_inherit_properties=True)

            # It's a GA ARD product.
            p.producer = "ga.gov.au"
            p.product_family = "ard"

            org_collection_number = utils.get_collection_number(
                p.producer, p.properties["landsat:collection_number"]
            )
            # TODO: wagl's algorithm version should determine our dataset version number, right?
            p.dataset_version = f"{org_collection_number}.0.0"
            p.region_code = _extract_reference_code(p, granule.name)

            _read_wagl_metadata(p, granule_group)
            _read_gqa_doc(p, granule.gqa_doc)
            _read_fmask_doc(p, granule.fmask_doc)

            _unpack_products(p, included_products, granule_group)

            if include_oa:
                with do("Starting OA", heading=True):
                    _unpack_observation_attributes(
                        p,
                        included_products,
                        granule_group,
                        infer_datetime_range=level1.platform.startswith("landsat"),
                    )

                if granule.fmask_image:
                    with do(f"Writing fmask from {granule.fmask_image} "):
                        p.write_measurement(
                            "oa:fmask",
                            granule.fmask_image,
                            expand_valid_data=False,
                            overview_resampling=Resampling.mode,
                        )

            with do("Finishing package"):
                return p.done()
def package_non_standard(outdir, granule):
    """
    yaml creator for the ard pipeline.
    """
    outdir = Path(outdir) / granule.name
    indir = granule.wagl_hdf5.parent

    if indir.is_file():
        shutil.copy(indir, outdir)
    else:
        shutil.copytree(indir, outdir)

    wagl_h5 = outdir / str(granule.name + ".wagl.h5")
    dataset_doc = outdir / str(granule.name + ".yaml")
    boolean_h5 = Path(str(wagl_h5).replace("wagl.h5", "converted.datasets.h5"))
    fmask_img = outdir / str(granule.name + ".fmask.img")

    # Opened for writing: converted (eg. boolean) datasets are stored in this file.
    f = h5py.File(boolean_h5, "w")

    with DatasetAssembler(metadata_path=dataset_doc, naming_conventions="dea") as da:
        level1 = granule.source_level1_metadata
        da.add_source_dataset(
            level1, auto_inherit_properties=True, inherit_geometry=True
        )
        da.product_family = "ard"
        da.producer = "ga.gov.au"
        da.properties["odc:file_format"] = "HDF5"

        with h5py.File(wagl_h5, "r") as fid:
            img_paths = [ppjoin(fid.name, pth) for pth in find(fid, "IMAGE")]
            granule_group = fid[granule.name]

            try:
                wagl_path, *ancil_paths = [
                    pth for pth in find(granule_group, "SCALAR") if "METADATA" in pth
                ]
            except ValueError:
                raise ValueError("No nbar metadata found in granule")

            [wagl_doc] = loads_yaml(granule_group[wagl_path][()])

            da.processed = get_path(wagl_doc, ("system_information", "time_processed"))

            platform = da.properties["eo:platform"]
            if platform == "sentinel-2a" or platform == "sentinel-2b":
                org_collection_number = 3
            else:
                org_collection_number = utils.get_collection_number(
                    platform, da.producer, da.properties["landsat:collection_number"]
                )

            da.dataset_version = f"{org_collection_number}.1.0"
            da.region_code = eodatasets3.wagl._extract_reference_code(da, granule.name)

            eodatasets3.wagl._read_gqa_doc(da, granule.gqa_doc)
            eodatasets3.wagl._read_fmask_doc(da, granule.fmask_doc)

            with rasterio.open(fmask_img) as ds:
                fmask_layer = "/{}/OA_FMASK/oa_fmask".format(granule.name)
                data = ds.read(1)
                fmask_ds = f.create_dataset(
                    fmask_layer, data=data, compression="lzf", shuffle=True
                )
                fmask_ds.attrs["crs_wkt"] = ds.crs.wkt
                fmask_ds.attrs["geotransform"] = ds.transform.to_gdal()
                fmask_ds.attrs["description"] = (
                    "Converted from ERDAS Imagine format to HDF5"
                    " to work with the limitations of varied formats within ODC"
                )

                grid_spec = images.GridSpec(
                    shape=ds.shape,
                    transform=ds.transform,
                    crs=CRS.from_wkt(fmask_ds.attrs["crs_wkt"]),
                )

                measurement_name = "oa_fmask"
                pathname = str(outdir.joinpath(boolean_h5))

                no_data = fmask_ds.attrs.get("no_data_value")
                if no_data is None:
                    no_data = float("nan")

                da._measurements.record_image(
                    measurement_name,
                    grid_spec,
                    pathname,
                    fmask_ds[:],
                    layer="/{}".format(fmask_layer),
                    nodata=no_data,
                    expand_valid_data=False,
                )

            for pathname in img_paths:
                ds = fid[pathname]
                ds_path = Path(ds.name)

                # eodatasets internally uses this grid spec to group image datasets
                grid_spec = images.GridSpec(
                    shape=ds.shape,
                    transform=Affine.from_gdal(*ds.attrs["geotransform"]),
                    crs=CRS.from_wkt(ds.attrs["crs_wkt"]),
                )

                # Product group name: lambertian, nbar, nbart, oa
                if "STANDARDISED-PRODUCTS" in str(ds_path):
                    product_group = ds_path.parent.name
                elif "INTERPOLATED-ATMOSPHERIC-COEFFICIENTS" in str(ds_path):
                    product_group = "oa_{}".format(ds_path.parent.name)
                else:
                    product_group = "oa"

                # Spatial resolution group, used to separate measurements with the same name.
                resolution_group = "rg{}".format(ds_path.parts[2].split("-")[-1])

                measurement_name = (
                    "_".join(
                        [
                            resolution_group,
                            product_group,
                            ds.attrs.get("alias", ds_path.name),
                        ]
                    )
                    .replace("-", "_")  # we don't want hyphens in ODC land
                    .lower()
                )

                # Include this band in defining the valid data bounds?
                include = "nbart" in measurement_name

                no_data = ds.attrs.get("no_data_value")
                if no_data is None:
                    no_data = float("nan")

                # If the dataset is boolean, convert it for GDAL (which has no boolean type).
                if ds.dtype.name == "bool":
                    pathname = str(outdir.joinpath(boolean_h5))
                    out_ds = f.create_dataset(
                        measurement_name,
                        data=np.uint8(ds[:]),
                        compression="lzf",
                        shuffle=True,
                        chunks=ds.chunks,
                    )
                    for k, v in ds.attrs.items():
                        out_ds.attrs[k] = v

                    da._measurements.record_image(
                        measurement_name,
                        grid_spec,
                        pathname,
                        out_ds[:],
                        layer="/{}".format(out_ds.name),
                        nodata=no_data,
                        expand_valid_data=include,
                    )
                else:
                    pathname = str(outdir.joinpath(wagl_h5))

                    # Work around note_measurement() not letting us specify the gridspec.
                    da._measurements.record_image(
                        measurement_name,
                        grid_spec,
                        pathname,
                        ds[:],
                        layer="/{}".format(ds.name),
                        nodata=no_data,
                        expand_valid_data=include,
                    )

        # The longest part here is generating the valid data bounds vector;
        # Landsat 7 post-SLC-OFF can take a really long time.
        return da.done()
def prepare_and_write(
    dataset: Path,
    dataset_document: Path,
) -> Tuple[uuid.UUID, Path]:
    # Process an ESA dataset
    if dataset.suffix == ".zip":
        with zipfile.ZipFile(dataset, "r") as z:
            # Get file paths for the ESA metadata files
            mtd_ds_zip_path = [s for s in z.namelist() if "MTD_DS.xml" in s][0]
            mtd_tl_zip_path = [s for s in z.namelist() if "MTD_TL.xml" in s][0]
            mtd_msil1c_zip_path = [s for s in z.namelist() if "MTD_MSIL1C.xml" in s][0]

            # Crawl through the metadata files and return a dict of useful information
            mtd_ds = process_mtd_ds(z.read(mtd_ds_zip_path).decode("utf-8"))
            mtd_tl = process_mtd_tl(z.read(mtd_tl_zip_path).decode("utf-8"))
            mtd_msil1c = process_mtd_msil1c(z.read(mtd_msil1c_zip_path).decode("utf-8"))

            with DatasetAssembler(
                metadata_path=dataset_document,
                dataset_location=dataset,
            ) as p:
                p.properties["eo:instrument"] = "MSI"
                p.properties["odc:producer"] = "esa.int"
                p.properties["odc:product_family"] = "level1"
                p.properties["odc:file_format"] = "JPEG2000"

                p.properties.update(mtd_ds)
                p.properties.update(mtd_tl)
                p.properties.update(mtd_msil1c)

                p.properties["odc:dataset_version"] = f"1.0.{p.processed:%Y%m%d}"

                for file in z.namelist():
                    # eg. T55HFA_20201011T000249_B01.jp2
                    if ".jp2" in file and "TCI" not in file and "PVI" not in file:
                        # The band number is the last underscore-separated token: 'B01' -> '01'
                        band_number = (
                            file.split("_")[-1].replace(".jp2", "").replace("B", "")
                        )
                        p.note_measurement(
                            path=file,
                            name=SENTINEL_MSI_BAND_ALIASES[band_number],
                            relative_to_dataset_location=True,
                        )

                p.add_accessory_file("metadata:mtd_ds", mtd_ds_zip_path)
                p.add_accessory_file("metadata:mtd_tl", mtd_tl_zip_path)
                p.add_accessory_file("metadata:mtd_msil1c", mtd_msil1c_zip_path)

                return p.done()

    # Process a Sinergise dataset
    elif dataset.is_dir():
        # Get file paths for the Sinergise metadata files
        product_info_path = dataset / "productInfo.json"
        metadata_xml_path = dataset / "metadata.xml"
        if not product_info_path.exists():
            raise ValueError(
                "No productInfo.json file found. "
                "Are you sure the input is a sinergise dataset folder?"
            )

        # Crawl through the metadata files and return a dict of useful information
        product_info = process_product_info(product_info_path)
        metadata_xml = process_metadata_xml(metadata_xml_path)

        with DatasetAssembler(
            metadata_path=dataset_document,
            dataset_location=dataset,
        ) as p:
            p.properties["eo:platform"] = "sentinel-2a"
            p.properties["eo:instrument"] = "MSI"
            p.properties["odc:file_format"] = "JPEG2000"
            p.properties["odc:product_family"] = "level1"
            p.properties["odc:producer"] = "sinergise.com"

            p.properties.update(metadata_xml)
            p.properties.update(product_info)

            p.properties["odc:dataset_version"] = f"1.0.{p.processed:%Y%m%d}"

            for path in dataset.rglob("*.jp2"):
                if "preview" not in path.stem and "TCI" not in path.stem:
                    p.note_measurement(
                        path=path,
                        name=SENTINEL_MSI_BAND_ALIASES[path.stem.replace("B", "")],
                    )

            p.add_accessory_file("metadata:product_info", product_info_path)
            p.add_accessory_file("metadata:sinergise_metadata", metadata_xml_path)

            return p.done()

    else:
        raise NotImplementedError("Unknown input file type?")
def create_eo3(granule_dir, granule_id):
    """
    Creates an eo3 document.

    :param granule_dir (Path): the granule directory
    :param granule_id (str): the granule id
    :return: DatasetDoc of eo3 metadata
    """
    with open(granule_dir / "ARD-METADATA.yaml") as fin:
        metadata = yaml.safe_load(fin)

    try:
        coords = metadata["grid_spatial"]["projection"]["valid_data"]["coordinates"]
        expand_valid_data = False
    except KeyError:
        expand_valid_data = True

    assembler = DatasetAssembler(
        dataset_location=granule_dir,
        metadata_path=granule_dir / "dummy",
    )
    if "S2A" in str(granule_dir):
        assembler.product_family = "s2a_ard_granule"
        platform = "SENTINEL_2A"
    else:
        assembler.product_family = "s2b_ard_granule"
        platform = "SENTINEL_2B"
    assembler.processed_now()

    add_datetime(assembler, granule_dir)
    add_to_eo3(
        assembler,
        granule_dir,
        "NBART",
        lambda x: code_to_band[x.split("_")[-1]],
        expand_valid_data,
    )
    add_to_eo3(
        assembler, granule_dir, "SUPPLEMENTARY", lambda x: x[3:].lower(), expand_valid_data
    )
    add_to_eo3(
        assembler,
        granule_dir,
        "QA",
        lambda x: x[3:].lower().replace("combined_", ""),
        expand_valid_data,
    )

    crs, grid_docs, measurement_docs = assembler._measurements.as_geo_docs()
    valid_data = assembler._measurements.consume_and_get_valid_data()

    assembler.properties["odc:region_code"] = metadata["provider"]["reference_code"]
    assembler.properties["gqa:cep90"] = metadata["gqa"]["residual"]["cep90"]
    assembler.properties["gqa:error_message"] = metadata["gqa"]["error_message"]
    assembler.properties["gqa:final_gcp_count"] = metadata["gqa"]["final_gcp_count"]
    assembler.properties["gqa:ref_source"] = metadata["gqa"]["ref_source"]
    assembler.properties["sentinel:datatake_start_datetime"] = granule_id.split("_")[-4]
    assembler.properties["eo:platform"] = platform
    assembler.properties["eo:instrument"] = "MSI"
    for key in [
        "abs_iterative_mean",
        "abs",
        "iterative_mean",
        "iterative_stddev",
        "mean",
        "stddev",
    ]:
        assembler.properties[f"gqa:{key}_xy"] = metadata["gqa"]["residual"][key]["xy"]

    eo3 = DatasetDoc(
        id=assembler.dataset_id,
        label=assembler.label,
        product=ProductDoc(
            name=assembler.names.product_name, href=assembler.names.product_uri
        ),
        crs=assembler._crs_str(crs) if crs is not None else None,
        geometry=valid_data,
        grids=grid_docs,
        properties=assembler.properties,
        accessories={
            name: AccessoryDoc(path, name=name)
            for name, path in assembler._accessories.items()
        },
        measurements=measurement_docs,
        lineage=assembler._lineage,
    )
    if not expand_valid_data:
        eo3.geometry = Polygon(coords[0])
    for measurement in eo3.measurements.values():
        if measurement.grid is None:
            measurement.grid = "default"
    return eo3
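
# A usage sketch for create_eo3() (hypothetical caller, not from the original
# module): build the eo3 document and write it beside the granule. Assumes
# eodatasets3's serialise.to_path(), the counterpart of the serialise.from_path()
# reads used elsewhere in this code.
def write_eo3_for_granule(granule_dir: Path, granule_id: str) -> Path:
    from eodatasets3 import serialise

    doc = create_eo3(granule_dir, granule_id)
    out = granule_dir / f"{granule_id}.odc-metadata.yaml"
    serialise.to_path(out, doc)
    return out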
def package(
    out_directory: Path,
    granule: Granule,
    *,
    product_maturity: ProductMaturity = ProductMaturity.stable,
    included_products: Iterable[str] = DEFAULT_PRODUCTS,
    include_oa: bool = True,
    oa_resolution: Optional[Tuple[float, float]] = None,
    contiguity_resolution: Optional[Tuple[float, float]] = None,
) -> Tuple[UUID, Path]:
    """
    Package an L2 product.

    :param include_oa:
    :param out_directory:
        The base directory for output datasets. A DEA-naming-conventions
        folder hierarchy will be created inside this folder.
    :param granule:
        Granule information. You probably want to make one with Granule.from_path()
    :param included_products:
        A list of imagery products to include in the package.
        Defaults to all products.
    :return:
        The dataset UUID and output metadata path
    """
    included_products = tuple(s.lower() for s in included_products)

    with h5py.File(granule.wagl_hdf5, "r") as fid:
        granule_group = fid[granule.name]
        wagl_doc = _read_wagl_metadata(granule_group)

        with DatasetAssembler(
            out_directory.absolute(),
            # WAGL stamps a good, random ID already.
            dataset_id=granule.wagl_metadata.get("id"),
            naming_conventions="dea_s2"
            if ("sentinel" in wagl_doc["source_datasets"]["platform_id"].lower())
            else "dea",
        ) as p:
            _apply_wagl_metadata(p, wagl_doc)

            # It's a GA ARD product.
            p.producer = "ga.gov.au"
            p.product_family = "ard"
            p.maturity = _determine_maturity(
                acq_date=p.datetime,
                processed=p.processed,
                wagl_doc=wagl_doc,
            )
            # We don't bother including product maturity when it's stable, for
            # consistency with old datasets: stable is the assumed default.
            if product_maturity is not ProductMaturity.stable:
                p.product_maturity = product_maturity

            if granule.source_level1_metadata is not None:
                # For historical consistency: we want to use the instrument that the source L1 product
                # came from, not the instruments reported from the WAGL doc.
                #
                # Eg. Level 1 will say "OLI_TIRS", while the wagl doc will say "OLI".
                # Our current C3 products say "OLI_TIRS", so we need to stay consistent.
                # (even though WAGL only *used* the OLI bands, it came from an OLI_TIRS observation)
                #
                # So delete our current wagl one, since we're adding a source dataset:
                if p.instrument is not None:
                    del p.properties["eo:instrument"]

                p.add_source_dataset(
                    granule.source_level1_metadata, auto_inherit_properties=True
                )
                # When level 1 is NRT, ARD is always NRT.
                if granule.source_level1_metadata.maturity == "nrt":
                    p.maturity = "nrt"

            org_collection_number = utils.get_collection_number(
                p.platform, p.producer, p.properties.get("landsat:collection_number")
            )
            p.dataset_version = f"{org_collection_number}.2.1"
            p.region_code = _extract_reference_code(p, granule.name)

            _read_gqa_doc(p, granule.gqa_doc)
            _read_fmask_doc(p, granule.fmask_doc)
            if granule.s2cloudless_doc:
                _read_s2cloudless_doc(p, granule.s2cloudless_doc)
            if granule.tesp_doc:
                _take_software_versions(p, granule.tesp_doc)

            _unpack_products(p, included_products, granule_group)

            if include_oa:
                with sub_product("oa", p):
                    with do("Starting OA", heading=True):
                        resolution_groups = {
                            tuple(granule_group[k].attrs["resolution"]): granule_group[k]
                            for k in granule_group.keys()
                            if k.startswith("RES-GROUP-")
                        }

                        # Use the highest resolution as the ground sample distance.
if "eo:gsd" in p.properties: del p.properties["eo:gsd"] p.properties["eo:gsd"] = min( min(resolution_groups.keys())) _unpack_observation_attributes( p, get_oa_resolution_group(resolution_groups, p.platform, oa_resolution), ) infer_datetime_range = p.platform.startswith("landsat") with do("Contiguity", timedelta=infer_datetime_range): # For landsat, we want the "common" band resolution, not panchromatic. Pick lower res. if contiguity_resolution is not None: contiguity_res = contiguity_resolution elif p.platform.startswith("landsat"): contiguity_res = max(resolution_groups.keys()) elif p.platform.startswith("sentinel"): contiguity_res = (10.0, 10.0) if contiguity_res not in resolution_groups: raise ValueError( f"No resolution group {contiguity_res} found in {granule.name}." f"Options: {list(resolution_groups.keys())}") contiguity_res_grp = resolution_groups[contiguity_res] timedelta_data = ( contiguity_res_grp["SATELLITE-SOLAR/TIME-DELTA"] if infer_datetime_range else None) _create_contiguity( p, included_products, resolution_yx=tuple( contiguity_res_grp.attrs["resolution"]), timedelta_data=timedelta_data, ) if granule.fmask_image: with do(f"Writing fmask from {granule.fmask_image} "): p.write_measurement( "oa:fmask", granule.fmask_image, expand_valid_data=False, overview_resampling=Resampling.mode, # Because of our strange sub-products and filename standards, we want the # 'oa_' prefix to be included in the recorded band metadata, # but not in its filename. # So we manually calculate a filename without the extra prefix. path=p.names.measurement_filename("fmask"), ) if granule.s2cloudless_prob: with do(f"Writing s2cloudless probability from {granule.s2cloudless_prob} " ): p.write_measurement( "oa:s2cloudless_prob", granule.s2cloudless_prob, expand_valid_data=False, overview_resampling=Resampling.bilinear, path=p.names.measurement_filename( "s2cloudless-prob"), ) if granule.s2cloudless_mask: with do(f"Writing s2cloudless mask from {granule.s2cloudless_mask} " ): p.write_measurement( "oa:s2cloudless_mask", granule.s2cloudless_mask, expand_valid_data=False, overview_resampling=Resampling.mode, path=p.names.measurement_filename( "s2cloudless-mask"), ) with do("Finishing package"): return p.done()
def test_s2_naming_conventions(tmp_path: Path):
    """A minimal dataset with sentinel platform/instrument"""
    p = DatasetAssembler(tmp_path, naming_conventions="dea_s2")
    p.platform = "sentinel-2a"
    p.instrument = "msi"
    p.datetime = datetime(2018, 11, 4)
    p.product_family = "blueberries"
    p.processed = "2018-11-05T12:23:23"
    p.producer = "ga.gov.au"
    p.dataset_version = "1.0.0"
    p.region_code = "Oz"
    p.properties["odc:file_format"] = "GeoTIFF"
    p.properties["sentinel:sentinel_tile_id"] = (
        "S2A_OPER_MSI_L1C_TL_SGS__20170822T015626_A011310_T54KYU_N02.05"
    )

    p.note_source_datasets(
        "telemetry",
        # Accepts multiple, and they can be strings or UUIDs:
        "ca705033-0fc4-4f38-a47e-f425dfb4d0c7",
        uuid.UUID("3781e90f-b677-40af-9439-b40f6e4dfadd"),
    )

    # The property normaliser should have extracted the inner fields
    assert p.properties["sentinel:datatake_start_datetime"] == datetime(
        2017, 8, 22, 1, 56, 26, tzinfo=timezone.utc
    )

    dataset_id, metadata_path = p.done()

    # The s2 naming conventions have an extra subfolder of the datatake start time.
    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert metadata_path_offset == (
        "ga_s2am_blueberries_1/Oz/2018/11/04/20170822T015626/"
        "ga_s2am_blueberries_1-0-0_Oz_2018-11-04.odc-metadata.yaml"
    )

    assert_expected_eo3_path(
        {
            "$schema": "https://schemas.opendatacube.org/dataset",
            "accessories": {},
            "id": dataset_id,
            "label": "ga_s2am_blueberries_1-0-0_Oz_2018-11-04",
            "product": {
                "href": "https://collections.dea.ga.gov.au/product/ga_s2am_blueberries_1",
                "name": "ga_s2am_blueberries_1",
            },
            "properties": {
                "datetime": datetime(2018, 11, 4, 0, 0),
                "eo:instrument": "msi",
                "eo:platform": "sentinel-2a",
                "odc:dataset_version": "1.0.0",
                "odc:file_format": "GeoTIFF",
                "odc:processing_datetime": datetime(2018, 11, 5, 12, 23, 23),
                "odc:producer": "ga.gov.au",
                "odc:product_family": "blueberries",
                "odc:region_code": "Oz",
                "sentinel:datatake_start_datetime": datetime(2017, 8, 22, 1, 56, 26),
                "sentinel:sentinel_tile_id":
                    "S2A_OPER_MSI_L1C_TL_SGS__20170822T015626_A011310_T54KYU_N02.05",
            },
            "lineage": {
                "telemetry": [
                    "ca705033-0fc4-4f38-a47e-f425dfb4d0c7",
                    "3781e90f-b677-40af-9439-b40f6e4dfadd",
                ]
            },
        },
        expected_path=metadata_path,
    )
def test_dea_style_package(
    l1_ls8_dataset: DatasetDoc, l1_ls8_dataset_path: Path, tmp_path: Path
):
    out = tmp_path

    [blue_geotiff_path] = l1_ls8_dataset_path.rglob("L*_B2.TIF")

    with DatasetAssembler(out, naming_conventions="dea") as p:
        # We add a source dataset, asking to inherit the common properties
        # (eg. platform, instrument, datetime)
        p.add_source_path(l1_ls8_dataset_path, auto_inherit_properties=True)

        # It's a GA product of "numerus-unus" ("the number one").
        p.producer = "ga.gov.au"
        p.product_family = "ones"
        p.dataset_version = "3.0.0"

        # Known properties are normalised (see tests at bottom of file)
        p.platform = "LANDSAT_8"  # to: 'landsat-8'
        p.processed = "2016-03-04 14:23:30Z"  # into a date.
        p.maturity = "FINAL"  # lowercased
        p.properties["eo:off_nadir"] = "34"  # into a number

        # Write a measurement from a numpy array, using the source dataset's grid spec.
        p.write_measurement_numpy(
            "ones",
            numpy.ones((60, 60), numpy.int16),
            GridSpec.from_dataset_doc(l1_ls8_dataset),
            nodata=-999,
        )

        # Copy a measurement from an input file (it will write a COG with DEA naming conventions)
        p.write_measurement("blue", blue_geotiff_path)

        # Alternatively, all measurements could be by reference rather than a copy:
        # p.note_measurement("external_blue", blue_geotiff_path)
        # (See an example of referencing in eodatasets3/prepare/landsat_l1_prepare.py )

        # Write a thumbnail using the given bands as r/g/b.
        p.write_thumbnail("ones", "ones", "blue")
        # Write a singleband thumbnail using a bit flag
        p.write_thumbnail_singleband("blue", bit=1, kind="singleband")
        # Write a singleband thumbnail using a lookup table
        p.write_thumbnail_singleband(
            "blue", lookup_table={1: (0, 0, 255)}, kind="singleband_lut"
        )

        # Note any software versions important to this created data.
        p.note_software_version(
            "numerus-unus-processor",
            "https://github.com/GeoscienceAustralia/eo-datasets",
            "1.2.3",
        )

        # p.done() will validate the dataset and write it to the destination atomically.
        dataset_id, metadata_path = p.done()

    assert isinstance(dataset_id, UUID), "Expected a random UUID to be assigned"

    out = tmp_path / "ga_ls8c_ones_3/090/084/2016/01/21"
    assert out == metadata_path.parent
    assert_file_structure(
        out,
        {
            "ga_ls8c_ones_3-0-0_090084_2016-01-21_final.odc-metadata.yaml": "",
            "ga_ls8c_ones_3-0-0_090084_2016-01-21_final_blue.tif": "",
            "ga_ls8c_ones_3-0-0_090084_2016-01-21_final_ones.tif": "",
            "ga_ls8c_ones_3-0-0_090084_2016-01-21_final_thumbnail.jpg": "",
            "ga_ls8c_ones_3-0-0_090084_2016-01-21_final.proc-info.yaml": "",
            "ga_ls8c_ones_3-0-0_090084_2016-01-21_final.sha1": "",
            "ga_ls8c_singleband_3-0-0_090084_2016-01-21_final_thumbnail.jpg": "",
            "ga_ls8c_singleband_lut_3-0-0_090084_2016-01-21_final_thumbnail.jpg": "",
        },
    )

    # TODO: check the sha1 checksum list.

    assert_same_as_file(
        {
            "$schema": "https://schemas.opendatacube.org/dataset",
            "id": dataset_id,
            "label": "ga_ls8c_ones_3-0-0_090084_2016-01-21_final",
            "product": {
                # This was added automatically because we chose 'dea' conventions.
                "href": "https://collections.dea.ga.gov.au/product/ga_ls8c_ones_3",
                "name": "ga_ls8c_ones_3",
            },
            "crs": "epsg:32655",
            "geometry": {
                "coordinates": [
                    [
                        [879_315.0, -3_714_585.0],
                        [641_985.0, -3_714_585.0],
                        [641_985.0, -3_953_115.0],
                        [879_315.0, -3_953_115.0],
                        [879_315.0, -3_714_585.0],
                    ]
                ],
                "type": "Polygon",
            },
            "grids": {
                # Note that the two bands had identical grid specs, so it combined them into one grid.
"default": { "shape": [60, 60], "transform": [ 3955.5, 0.0, 641_985.0, 0.0, -3975.500_000_000_000_5, -3_714_585.0, 0.0, 0.0, 1.0, ], } }, "measurements": { "blue": { "path": "ga_ls8c_ones_3-0-0_090084_2016-01-21_final_blue.tif" }, "ones": { "path": "ga_ls8c_ones_3-0-0_090084_2016-01-21_final_ones.tif" }, }, "properties": { "datetime": datetime(2016, 1, 21, 23, 50, 23, 54435), "dea:dataset_maturity": "final", "odc:dataset_version": "3.0.0", "odc:file_format": "GeoTIFF", "odc:processing_datetime": "2016-03-04T14:23:30", "odc:producer": "ga.gov.au", "odc:product_family": "ones", # The remaining fields were inherited from the source dataset # (because we set auto_inherit_properties=True, and they're in the whitelist) "eo:platform": "landsat-8", # matching Stac's examples for capitalisation. "eo:instrument": "OLI_TIRS", # matching Stac's examples for capitalisation. "eo:cloud_cover": 93.22, "eo:off_nadir": 34.0, "eo:gsd": 15.0, "eo:sun_azimuth": 74.007_443_8, "eo:sun_elevation": 55.486_483, "landsat:collection_category": "T1", "landsat:collection_number": 1, "landsat:landsat_product_id": "LC08_L1TP_090084_20160121_20170405_01_T1", "landsat:landsat_scene_id": "LC80900842016021LGN02", "landsat:wrs_path": 90, "landsat:wrs_row": 84, "odc:region_code": "090084", }, "accessories": { # It wrote a checksum file for all of our files. "checksum:sha1": { "path": "ga_ls8c_ones_3-0-0_090084_2016-01-21_final.sha1" }, # We didn't add any extra processor metadata, so this just contains # some software versions. "metadata:processor": { "path": "ga_ls8c_ones_3-0-0_090084_2016-01-21_final.proc-info.yaml" }, # The thumbnails we made. "thumbnail": { "path": "ga_ls8c_ones_3-0-0_090084_2016-01-21_final_thumbnail.jpg" }, "thumbnail:singleband": { "path": "ga_ls8c_singleband_3-0-0_090084_2016-01-21_final_thumbnail.jpg" }, "thumbnail:singleband_lut": { "path": "ga_ls8c_singleband_lut_3-0-0_090084_2016-01-21_final_thumbnail.jpg" }, }, "lineage": { "level1": ["a780754e-a884-58a7-9ac0-df518a67f59d"] }, }, generated_file=metadata_path, )
def test_s2_naming_conventions_full_doc(tmp_path: Path):
    """A minimal dataset with sentinel platform/instrument, checking the whole written document.

    (Renamed from test_s2_naming_conventions to avoid shadowing the lineage variant above.)
    """
    p = DatasetAssembler(tmp_path, naming_conventions="dea_s2")
    p.platform = "sentinel-2a"
    p.instrument = "msi"
    p.datetime = datetime(2018, 11, 4)
    p.product_family = "blueberries"
    p.processed = "2018-11-05T12:23:23"
    p.producer = "ga.gov.au"
    p.dataset_version = "1.0.0"
    p.region_code = "Oz"
    p.properties["odc:file_format"] = "GeoTIFF"
    p.properties["sentinel:sentinel_tile_id"] = (
        "S2A_OPER_MSI_L1C_TL_SGS__20170822T015626_A011310_T54KYU_N02.05"
    )

    # The property normaliser should have extracted the inner fields
    assert p.properties["sentinel:datatake_start_datetime"] == datetime(
        2017, 8, 22, 1, 56, 26, tzinfo=timezone.utc
    )

    dataset_id, metadata_path = p.done()

    # The s2 naming conventions have an extra subfolder of the datatake start time.
    metadata_path_offset = metadata_path.relative_to(tmp_path).as_posix()
    assert metadata_path_offset == (
        "ga_s2am_blueberries_1/Oz/2018/11/04/20170822T015626/"
        "ga_s2am_blueberries_1-0-0_Oz_2018-11-04.odc-metadata.yaml"
    )

    assert_same_as_file(
        {
            "$schema": "https://schemas.opendatacube.org/dataset",
            "accessories": {
                "checksum:sha1": {
                    "path": "ga_s2am_blueberries_1-0-0_Oz_2018-11-04.sha1"
                },
                "metadata:processor": {
                    "path": "ga_s2am_blueberries_1-0-0_Oz_2018-11-04.proc-info.yaml"
                },
            },
            "id": dataset_id,
            "label": "ga_s2am_blueberries_1-0-0_Oz_2018-11-04",
            "lineage": {},
            "product": {
                "href": "https://collections.dea.ga.gov.au/product/ga_s2am_blueberries_1",
                "name": "ga_s2am_blueberries_1",
            },
            "properties": {
                "datetime": datetime(2018, 11, 4, 0, 0),
                "eo:instrument": "msi",
                "eo:platform": "sentinel-2a",
                "odc:dataset_version": "1.0.0",
                "odc:file_format": "GeoTIFF",
                "odc:processing_datetime": datetime(2018, 11, 5, 12, 23, 23),
                "odc:producer": "ga.gov.au",
                "odc:product_family": "blueberries",
                "odc:region_code": "Oz",
                "sentinel:datatake_start_datetime": datetime(2017, 8, 22, 1, 56, 26),
                "sentinel:sentinel_tile_id":
                    "S2A_OPER_MSI_L1C_TL_SGS__20170822T015626_A011310_T54KYU_N02.05",
            },
        },
        generated_file=metadata_path,
    )