Example #1
def _read_wagl_metadata(p: DatasetAssembler, granule_group: h5py.Group):
    try:
        # Unpacking raises ValueError when no 'METADATA' scalar datasets exist.
        wagl_path, *ancil_paths = [
            pth for pth in _find_h5_paths(granule_group, "SCALAR")
            if "METADATA" in pth
        ]
    except ValueError:
        raise ValueError("No nbar metadata found in granule")

    [wagl_doc] = loads_yaml(granule_group[wagl_path][()])

    try:
        p.processed = get_path(wagl_doc,
                               ("system_information", "time_processed"))
    except PathAccessError:
        raise ValueError(
            f"WAGL dataset contains no time processed. Path {wagl_path}")

    # Merge each ancillary metadata document into the main wagl document.
    for i, path in enumerate(ancil_paths, start=2):
        wagl_doc.setdefault(f"wagl_{i}", {}).update(
            list(loads_yaml(granule_group[path][()]))[0]["ancillary"])

    p.properties["dea:dataset_maturity"] = _determine_maturity(
        p.datetime, p.processed, wagl_doc)

    _take_software_versions(p, wagl_doc)
    p.extend_user_metadata("wagl", wagl_doc)
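A minimal usage sketch for the function above. The HDF5 path and granule key are hypothetical, and it assumes the assembler's datetime has already been set from the source level1 (since _determine_maturity reads p.datetime); DatasetAssembler is the eodatasets3 assembler also seen in Example #8:

import h5py
from pathlib import Path
from eodatasets3 import DatasetAssembler

with h5py.File("granule.wagl.h5", "r") as fid, DatasetAssembler(
        metadata_path=Path("granule.odc-metadata.yaml"),
        naming_conventions="dea") as p:
    # Fills in p.processed, dataset maturity, software versions
    # and the "wagl" user-metadata section.
    _read_wagl_metadata(p, fid["GRANULE_NAME"])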
Example #2
def _read_wagl_metadata(granule_group: h5py.Group):
    try:
        wagl_path, *ancil_paths = (
            pth for pth in _find_h5_paths(granule_group, "SCALAR")
            if "METADATA" in pth)
    except ValueError:
        raise ValueError("No nbar metadata found in granule")

    [wagl_doc] = loads_yaml(granule_group[wagl_path][()])

    for i, path in enumerate(ancil_paths, start=2):
        wagl_doc.setdefault(f"wagl_{i}", {}).update(
            list(loads_yaml(granule_group[path][()]))[0]["ancillary"])
    return wagl_doc
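A usage sketch for this variant, which returns the merged document rather than writing into an assembler (file and granule names are hypothetical):

import h5py

with h5py.File("granule.wagl.h5", "r") as fid:
    wagl_doc = _read_wagl_metadata(fid["GRANULE_NAME"])
    # The same structure Example #1 reads its processing time from:
    print(wagl_doc["system_information"]["time_processed"])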
Example #3
def read_documents(*paths: Path) -> Generator[Tuple[Path, Dict], None, None]:
    """
    Read & parse documents from the filesystem (yaml or json).

    Note that a single yaml file can contain multiple documents.
    """
    for path in paths:
        suffix = path.suffix.lower()

        # If compressed, open as gzip stream.
        opener = open
        if suffix == ".gz":
            suffix = path.suffixes[-2].lower()
            opener = gzip.open

        with opener(str(path), "r") as f:
            if suffix in (".yaml", ".yml"):
                for parsed_doc in serialise.loads_yaml(f):
                    yield path, parsed_doc
            elif suffix == ".json":
                yield path, json.load(f)
            else:
                raise ValueError(
                    f"Unknown document type for {path.name}; "
                    f"expected one of {_ALL_SUPPORTED_EXTENSIONS!r}."
                )
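A usage sketch (file names hypothetical). Since one .yaml file can contain several documents, the same path may be yielded more than once:

from pathlib import Path

for path, doc in read_documents(Path("product.odc-product.yaml"),
                                Path("dataset.odc-metadata.json.gz")):
    print(path.name, sorted(doc))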
Example #4
def validate_paths(
    paths: List[Path],
    thorough: bool = False,
    expect_extra_measurements: bool = False,
) -> Generator[Tuple[Path, int, List[ValidationMessage]], None, None]:
    """Validate the list of paths. Product documents can be specified before their datasets."""
    products: Dict[str, Dict] = {}

    for path in paths:
        with path.open("r") as f:
            inner_docs = serialise.loads_yaml(f)
            for i, doc in enumerate(inner_docs):
                if is_product(doc):
                    products[doc["name"]] = doc
                    messages = list(validate_product(doc))
                else:
                    messages = validate_eo3_doc(doc, path, products, thorough,
                                                expect_extra_measurements)
                if messages:
                    yield path, i, messages
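A usage sketch (paths hypothetical). Product documents are listed before the datasets that use them, as the docstring requires:

from pathlib import Path

for path, doc_index, messages in validate_paths(
        [Path("product.odc-product.yaml"), Path("dataset.odc-metadata.yaml")]):
    for message in messages:
        print(f"{path.name} (doc #{doc_index}): {message}")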
Example #5
def test_prepare_ncep_reanalysis1_pr_wtr(tmpdir):
    output_path = Path(tmpdir)
    expected_metadata_path = output_path / "pr_wtr.eatm.2018.test.ga-md.yaml"

    expected_doc = [
        {
            "crs": "epsg:4236",
            "datetime": "2018-01-01T00:00:00+00:00",
            "geometry": {
                "coordinates": [[
                    [-1.25, 91.25],
                    [-1.25, -91.25],
                    [358.75, -91.25],
                    [358.75, 91.25],
                    [-1.25, 91.25],
                ]],
                "type": "Polygon",
            },
            "grids": {
                "default": {
                    "shape": [73, 144],
                    "transform": [2.5, 0.0, -1.25, 0.0, -2.5, 91.25, 0.0, 0.0, 1.0],
                }
            },
            "id": "fb3afcb0-4301-57c5-8455-35a64e3b0c53",
            "lineage": {},
            "measurements": {
                "water_vapour": {
                    "band": 1,
                    "layer": "pr_wtr",
                    "path": "pr_wtr.eatm.2018.test.nc",
                }
            },
            "product": {
                "href": "https://collections.dea.ga.gov.au/noaa_c_c_prwtreatm_1"
            },
            "properties": {
                "item:providers": [{
                    "name": "NOAA/OAR/ESRL PSD",
                    "roles": ["producer"],
                    "url": "https://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanalysis.derived.surface.html",
                }],
                "odc:creation_datetime": "2019-05-15T07:29:04.948999+00:00",
                "odc:file_format": "NetCDF",
            },
        },
        {
            "crs": "epsg:4236",
            "datetime": "2018-01-01T06:00:00+00:00",
            "geometry": {
                "coordinates": [[
                    [-1.25, 91.25],
                    [-1.25, -91.25],
                    [358.75, -91.25],
                    [358.75, 91.25],
                    [-1.25, 91.25],
                ]],
                "type": "Polygon",
            },
            "grids": {
                "default": {
                    "shape": [73, 144],
                    "transform": [2.5, 0.0, -1.25, 0.0, -2.5, 91.25, 0.0, 0.0, 1.0],
                }
            },
            "id": "47d52e5b-b6aa-5cb6-888d-06c8e4bfa756",
            "lineage": {},
            "measurements": {
                "water_vapour": {
                    "band": 2,
                    "layer": "pr_wtr",
                    "path": "pr_wtr.eatm.2018.test.nc",
                }
            },
            "product": {
                "href": "https://collections.dea.ga.gov.au/noaa_c_c_prwtreatm_1"
            },
            "properties": {
                "item:providers": [{
                    "name": "NOAA/OAR/ESRL PSD",
                    "roles": ["producer"],
                    "url": "https://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanalysis.derived.surface.html",
                }],
                "odc:creation_datetime": "2019-05-15T07:34:18.424782+00:00",
                "odc:file_format": "NetCDF",
            },
        },
    ]

    run_prepare_cli(
        noaa_c_c_prwtreatm_1_prepare.main,
        "--output",
        str(output_path),
        str(NCEP_PR_WTR_FILE),
    )

    assert expected_metadata_path.exists()

    with expected_metadata_path.open("r") as f:
        docs = list(serialise.loads_yaml(f))

    assert len(docs) == len(expected_doc)
    for expected, actual in zip(expected_doc, docs):
        doc_diff = _diff(
            expected,
            actual,
            exclude_paths="root['properties']['odc:creation_datetime']",
        )
        assert doc_diff == {}, pformat(doc_diff)
Example #6
    def for_path(
        cls,
        wagl_hdf5: Path,
        granule_names: Optional[Sequence[str]] = None,
        level1_metadata_path: Optional[Path] = None,
        fmask_image_path: Optional[Path] = None,
        fmask_doc_path: Optional[Path] = None,
        gqa_doc_path: Optional[Path] = None,
    ):
        """
        Create granules by scanning the given hdf5 file.

        Optionally specify additional files and level1 path.

        If they are not specified, it looks for them using WAGL's output naming conventions.
        """
        if not wagl_hdf5.exists():
            raise ValueError(f"Input hdf5 doesn't exist {wagl_hdf5}")

        with h5py.File(wagl_hdf5, "r") as fid:
            granule_names = granule_names or fid.keys()

            for granule_name in granule_names:
                if granule_name not in fid:
                    raise ValueError(
                        f"Granule {granule_name!r} not found in file {wagl_hdf5}"
                    )

                wagl_doc_field = get_path(
                    fid, (granule_name, "METADATA", "CURRENT"))
                if not wagl_doc_field:
                    raise ValueError(
                        f"Granule contains no wagl metadata: {granule_name} in {wagl_hdf5}"
                    )

                [wagl_doc] = loads_yaml(wagl_doc_field[()])

                if not level1_metadata_path:
                    level1_tar_path = Path(
                        get_path(wagl_doc,
                                 ("source_datasets", "source_level1")))
                    level1_metadata_path = level1_tar_path.with_suffix(
                        ".odc-metadata.yaml")
                if not level1_metadata_path.exists():
                    raise ValueError(
                        f"No level1 metadata found at {level1_metadata_path}")

                level1 = serialise.from_path(level1_metadata_path)

                fmask_image_path = fmask_image_path or wagl_hdf5.with_name(
                    f"{granule_name}.fmask.img")
                if not fmask_image_path.exists():
                    raise ValueError(
                        f"No fmask image found at {fmask_image_path}")

                fmask_doc_path = fmask_doc_path or fmask_image_path.with_suffix(
                    ".yaml")
                if not fmask_doc_path.exists():
                    raise ValueError(f"No fmask found at {fmask_doc_path}")
                with fmask_doc_path.open("r") as fl:
                    [fmask_doc] = loads_yaml(fl)

                gqa_doc_path = gqa_doc_path or wagl_hdf5.with_name(
                    f"{granule_name}.gqa.yaml")
                if not gqa_doc_path.exists():
                    raise ValueError(f"No gqa found at {gqa_doc_path}")
                with gqa_doc_path.open("r") as fl:
                    [gqa_doc] = loads_yaml(fl)

                yield cls(
                    name=granule_name,
                    wagl_hdf5=wagl_hdf5,
                    wagl_metadata=wagl_doc,
                    source_level1_metadata=level1,
                    fmask_doc=fmask_doc,
                    fmask_image=fmask_image_path,
                    gqa_doc=gqa_doc,
                )
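A usage sketch for the classmethod above. The class name Granule and the file name are assumptions; each yielded object carries the fields passed to cls(...):

from pathlib import Path

for granule in Granule.for_path(Path("LC80920842016180LGN01.wagl.h5")):
    print(granule.name, granule.fmask_image)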
Example #7
    def for_path(
        cls,
        wagl_hdf5: Path,
        granule_names: Optional[Sequence[str]] = None,
        level1_metadata_path: Optional[Path] = None,
        fmask_image_path: Optional[Path] = None,
        fmask_doc_path: Optional[Path] = None,
        s2cloudless_prob_path: Optional[Path] = None,
        s2cloudless_mask_path: Optional[Path] = None,
        s2cloudless_doc_path: Optional[Path] = None,
        gqa_doc_path: Optional[Path] = None,
        tesp_doc_path: Optional[Path] = None,
        allow_missing_provenance: bool = False,
    ):
        """
        Create granules by scanning the given hdf5 file.

        Optionally specify additional files and level1 path.

        If they are not specified, it looks for them using WAGL's output naming conventions.

        :param allow_missing_provenance: if true, allow the level1 source
            metadata to be missing rather than raising an error.
        """
        if not wagl_hdf5.exists():
            raise ValueError(f"Input hdf5 doesn't exist {wagl_hdf5}")

        with h5py.File(wagl_hdf5, "r") as fid:
            granule_names = granule_names or fid.keys()

            for granule_name in granule_names:
                if granule_name not in fid:
                    raise ValueError(
                        f"Granule {granule_name!r} not found in file {wagl_hdf5}"
                    )

                wagl_doc_field = get_path(
                    fid, (granule_name, "METADATA", "CURRENT"))
                if not wagl_doc_field:
                    raise ValueError(
                        f"Granule contains no wagl metadata: {granule_name} in {wagl_hdf5}"
                    )

                [wagl_doc] = loads_yaml(wagl_doc_field[()])

                level1 = _load_level1_doc(wagl_doc, level1_metadata_path,
                                          allow_missing_provenance)

                fmask_image_path = fmask_image_path or wagl_hdf5.with_name(
                    f"{granule_name}.fmask.img")
                if not fmask_image_path.exists():
                    raise ValueError(
                        f"No fmask image found at {fmask_image_path}")

                fmask_doc_path = fmask_doc_path or fmask_image_path.with_suffix(
                    ".yaml")
                if not fmask_doc_path.exists():
                    raise ValueError(f"No fmask found at {fmask_doc_path}")
                with fmask_doc_path.open("r") as fl:
                    [fmask_doc] = loads_yaml(fl)

                if "sentinel" in wagl_doc["source_datasets"][
                        "platform_id"].lower():
                    s2cloudless_prob_path = (
                        s2cloudless_prob_path or wagl_hdf5.with_name(
                            f"{granule_name}.prob.s2cloudless.tif"))
                    if not s2cloudless_prob_path.exists():
                        raise ValueError(
                            f"No s2cloudless probability image found at {s2cloudless_prob_path}"
                        )

                    s2cloudless_mask_path = (
                        s2cloudless_mask_path or wagl_hdf5.with_name(
                            f"{granule_name}.mask.s2cloudless.tif"))
                    if not s2cloudless_mask_path.exists():
                        raise ValueError(
                            f"No s2cloudless mask image found at {s2cloudless_mask_path}"
                        )

                    s2cloudless_doc_path = s2cloudless_doc_path or wagl_hdf5.with_name(
                        f"{granule_name}.s2cloudless.yaml")
                    if not s2cloudless_doc_path.exists():
                        raise ValueError(
                            f"No s2cloudless metadata found at {s2cloudless_doc_path}"
                        )
                    with s2cloudless_doc_path.open("r") as fl:
                        [s2cloudless_doc] = loads_yaml(fl)
                else:
                    s2cloudless_prob_path = None
                    s2cloudless_mask_path = None
                    s2cloudless_doc = None

                gqa_doc_path = gqa_doc_path or wagl_hdf5.with_name(
                    f"{granule_name}.gqa.yaml")
                if not gqa_doc_path.exists():
                    raise ValueError(f"No gqa found at {gqa_doc_path}")
                with gqa_doc_path.open("r") as fl:
                    [gqa_doc] = loads_yaml(fl)

                # Optional doc: tesp output may not exist for every granule.
                tesp_doc = None
                if tesp_doc_path:
                    # But if they gave us a path, we're strict about it existing.
                    if not tesp_doc_path.exists():
                        raise ValueError(
                            f"Supplied tesp doc path doesn't exist: {tesp_doc_path}"
                        )
                else:
                    tesp_doc_path = wagl_hdf5.with_name(
                        f"{granule_name}.tesp.yaml")
                if tesp_doc_path.exists():
                    with tesp_doc_path.open("r") as fl:
                        [tesp_doc] = loads_yaml(fl)

                yield cls(
                    name=granule_name,
                    wagl_hdf5=wagl_hdf5,
                    wagl_metadata=wagl_doc,
                    source_level1_metadata=level1,
                    fmask_doc=fmask_doc,
                    fmask_image=fmask_image_path,
                    s2cloudless_prob=s2cloudless_prob_path,
                    s2cloudless_mask=s2cloudless_mask_path,
                    s2cloudless_doc=s2cloudless_doc,
                    gqa_doc=gqa_doc,
                    tesp_doc=tesp_doc,
                )
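A sketch of the Sentinel-2 variant above (class and file names are assumptions). Cloud-mask paths may be given explicitly, otherwise they are derived from the granule name, and allow_missing_provenance tolerates a missing level1 document:

from pathlib import Path

for granule in Granule.for_path(
        Path("S2A_GRANULE.wagl.h5"),
        s2cloudless_mask_path=Path("S2A_GRANULE.mask.s2cloudless.tif"),
        allow_missing_provenance=True):
    print(granule.name, granule.s2cloudless_mask)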
Example #8
def package_non_standard(outdir, granule):
    """
    yaml creator for the ard pipeline.
    """

    outdir = Path(outdir) / granule.name
    indir = granule.wagl_hdf5.parent

    if indir.is_file():
        shutil.copy(indir, outdir)
    else:
        shutil.copytree(indir, outdir)

    wagl_h5 = outdir / str(granule.name + ".wagl.h5")
    dataset_doc = outdir / str(granule.name + ".yaml")
    boolean_h5 = Path(str(wagl_h5).replace("wagl.h5", "converted.datasets.h5"))
    fmask_img = outdir / str(granule.name + ".fmask.img")

    # Holds datasets converted for GDAL/ODC compatibility (fmask, boolean bands).
    f = h5py.File(boolean_h5, "w")

    with DatasetAssembler(metadata_path=dataset_doc,
                          naming_conventions="dea") as da:
        level1 = granule.source_level1_metadata
        da.add_source_dataset(level1,
                              auto_inherit_properties=True,
                              inherit_geometry=True)
        da.product_family = "ard"
        da.producer = "ga.gov.au"
        da.properties["odc:file_format"] = "HDF5"

        with h5py.File(wagl_h5, "r") as fid:
            img_paths = [ppjoin(fid.name, pth) for pth in find(fid, "IMAGE")]
            granule_group = fid[granule.name]

            try:
                wagl_path, *ancil_paths = [
                    pth for pth in find(granule_group, "SCALAR")
                    if "METADATA" in pth
                ]
            except ValueError:
                raise ValueError("No nbar metadata found in granule")

            [wagl_doc] = loads_yaml(granule_group[wagl_path][()])

            da.processed = get_path(wagl_doc,
                                    ("system_information", "time_processed"))

            platform = da.properties["eo:platform"]
            if platform in ("sentinel-2a", "sentinel-2b"):
                org_collection_number = 3
            else:
                org_collection_number = utils.get_collection_number(
                    platform, da.producer,
                    da.properties["landsat:collection_number"])

            da.dataset_version = f"{org_collection_number}.1.0"
            da.region_code = eodatasets3.wagl._extract_reference_code(
                da, granule.name)

            eodatasets3.wagl._read_gqa_doc(da, granule.gqa_doc)
            eodatasets3.wagl._read_fmask_doc(da, granule.fmask_doc)

            with rasterio.open(fmask_img) as ds:
                fmask_layer = "/{}/OA_FMASK/oa_fmask".format(granule.name)
                data = ds.read(1)
                fmask_ds = f.create_dataset(fmask_layer,
                                            data=data,
                                            compression="lzf",
                                            shuffle=True)
                fmask_ds.attrs["crs_wkt"] = ds.crs.wkt
                fmask_ds.attrs["geotransform"] = ds.transform.to_gdal()

                fmask_ds.attrs["description"] = (
                    "Converted from ERDAS Imagine format to HDF5 to work with"
                    " the limitations of varied formats within ODC"
                )

                grid_spec = images.GridSpec(
                    shape=ds.shape,
                    transform=ds.transform,
                    crs=CRS.from_wkt(fmask_ds.attrs["crs_wkt"]),
                )

                measurement_name = "oa_fmask"

                pathname = str(outdir.joinpath(boolean_h5))

                no_data = fmask_ds.attrs.get("no_data_value")
                if no_data is None:
                    no_data = float("nan")

                da._measurements.record_image(
                    measurement_name,
                    grid_spec,
                    pathname,
                    fmask_ds[:],
                    layer="/{}".format(fmask_layer),
                    nodata=no_data,
                    expand_valid_data=False,
                )

            for pathname in img_paths:
                ds = fid[pathname]
                ds_path = Path(ds.name)

                # eodatasets internally uses this grid spec to group image datasets
                grid_spec = images.GridSpec(
                    shape=ds.shape,
                    transform=Affine.from_gdal(*ds.attrs["geotransform"]),
                    crs=CRS.from_wkt(ds.attrs["crs_wkt"]),
                )

                # product group name; lambertian, nbar, nbart, oa
                if "STANDARDISED-PRODUCTS" in str(ds_path):
                    product_group = ds_path.parent.name
                elif "INTERPOLATED-ATMOSPHERIC-COEFFICIENTS" in str(ds_path):
                    product_group = "oa_{}".format(ds_path.parent.name)
                else:
                    product_group = "oa"

                # spatial resolution group
                # used to separate measurements with the same name
                resolution_group = "rg{}".format(
                    ds_path.parts[2].split("-")[-1])

                measurement_name = ("_".join([
                    resolution_group,
                    product_group,
                    ds.attrs.get("alias", ds_path.name),
                ]).replace("-",
                           "_").lower())  # we don't wan't hyphens in odc land

                # include this band in defining the valid data bounds?
                include = "nbart" in measurement_name

                no_data = ds.attrs.get("no_data_value")
                if no_data is None:
                    no_data = float("nan")

                # if we are of type bool, we'll have to convert just for GDAL
                if ds.dtype.name == "bool":
                    pathname = str(outdir.joinpath(boolean_h5))
                    out_ds = f.create_dataset(
                        measurement_name,
                        data=np.uint8(ds[:]),
                        compression="lzf",
                        shuffle=True,
                        chunks=ds.chunks,
                    )

                    for k, v in ds.attrs.items():
                        out_ds.attrs[k] = v

                    da._measurements.record_image(
                        measurement_name,
                        grid_spec,
                        pathname,
                        out_ds[:],
                        layer="/{}".format(out_ds.name),
                        nodata=no_data,
                        expand_valid_data=include,
                    )
                else:
                    pathname = str(outdir.joinpath(wagl_h5))

                    # workaround: note_measurement doesn't let us specify the grid spec
                    da._measurements.record_image(
                        measurement_name,
                        grid_spec,
                        pathname,
                        ds[:],
                        layer="/{}".format(ds.name),
                        nodata=no_data,
                        expand_valid_data=include,
                    )

        # The longest part here is generating the valid-data bounds vector;
        # Landsat 7 post-SLC-OFF can take a really long time.
        f.close()
        return da.done()
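A usage sketch tying the pieces together (paths hypothetical; Granule.for_path is the scanner from Examples #6/#7, and this assumes da.done() returns the dataset id and metadata path, as in eodatasets3):

from pathlib import Path

for granule in Granule.for_path(Path("LC80920842016180LGN01.wagl.h5")):
    dataset_id, metadata_path = package_non_standard(Path("out"), granule)
    print(dataset_id, metadata_path)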
Example #9
    def for_path(
        cls,
        wagl_hdf5: Path,
        granule_names: Optional[Sequence[str]] = None,
        level1_metadata_path: Optional[Path] = None,
        fmask_image_path: Optional[Path] = None,
        fmask_doc_path: Optional[Path] = None,
        gqa_doc_path: Optional[Path] = None,
        tesp_doc_path: Optional[Path] = None,
        allow_missing_provenance: bool = False,
    ):
        """
        Create granules by scanning the given hdf5 file.

        Optionally specify additional files and level1 path.

        If they are not specified, it looks for them using WAGL's output naming conventions.

        :param allow_missing_provenance: if true, allow the level1 source
            metadata to be missing rather than raising an error.
        """
        if not wagl_hdf5.exists():
            raise ValueError(f"Input hdf5 doesn't exist {wagl_hdf5}")

        with h5py.File(wagl_hdf5, "r") as fid:
            granule_names = granule_names or fid.keys()

            for granule_name in granule_names:
                if granule_name not in fid:
                    raise ValueError(
                        f"Granule {granule_name!r} not found in file {wagl_hdf5}"
                    )

                wagl_doc_field = get_path(
                    fid, (granule_name, "METADATA", "CURRENT"))
                if not wagl_doc_field:
                    raise ValueError(
                        f"Granule contains no wagl metadata: {granule_name} in {wagl_hdf5}"
                    )

                [wagl_doc] = loads_yaml(wagl_doc_field[()])

                if not level1_metadata_path:
                    level1_metadata_path = _get_level1_metadata_path(wagl_doc)
                if level1_metadata_path and not level1_metadata_path.exists():
                    raise ValueError(
                        f"No level1 metadata found at {level1_metadata_path}")

                level1 = (serialise.from_path(level1_metadata_path)
                          if level1_metadata_path else None)
                if (not level1_metadata_path) and (
                        not allow_missing_provenance):
                    raise ValueError(
                        "No level1 found or provided. "
                        f"WAGL said it was at path {str(level1_metadata_path)!r}. "
                        "It's not, and you didn't specify an alternative. "
                        f"(allow_missing_provenance={allow_missing_provenance})"
                    )

                fmask_image_path = fmask_image_path or wagl_hdf5.with_name(
                    f"{granule_name}.fmask.img")
                if not fmask_image_path.exists():
                    raise ValueError(
                        f"No fmask image found at {fmask_image_path}")

                fmask_doc_path = fmask_doc_path or fmask_image_path.with_suffix(
                    ".yaml")
                if not fmask_doc_path.exists():
                    raise ValueError(f"No fmask found at {fmask_doc_path}")
                with fmask_doc_path.open("r") as fl:
                    [fmask_doc] = loads_yaml(fl)

                gqa_doc_path = gqa_doc_path or wagl_hdf5.with_name(
                    f"{granule_name}.gqa.yaml")
                if not gqa_doc_path.exists():
                    raise ValueError(f"No gqa found at {gqa_doc_path}")
                with gqa_doc_path.open("r") as fl:
                    [gqa_doc] = loads_yaml(fl)

                # Optional doc: tesp output may not exist for every granule.
                tesp_doc = None
                if tesp_doc_path:
                    # But if they gave us a path, we're strict about it existing.
                    if not tesp_doc_path.exists():
                        raise ValueError(
                            f"Supplied tesp doc path doesn't exist: {tesp_doc_path}"
                        )
                else:
                    tesp_doc_path = wagl_hdf5.with_name(
                        f"{granule_name}.tesp.yaml")
                if tesp_doc_path.exists():
                    with tesp_doc_path.open("r") as fl:
                        [tesp_doc] = loads_yaml(fl)

                yield cls(
                    name=granule_name,
                    wagl_hdf5=wagl_hdf5,
                    wagl_metadata=wagl_doc,
                    source_level1_metadata=level1,
                    fmask_doc=fmask_doc,
                    fmask_image=fmask_image_path,
                    gqa_doc=gqa_doc,
                    tesp_doc=tesp_doc,
                )