def test_do_not_overwrite_user_coordinates(self) -> None:
    orig = Dataset(
        coords={"x": [0, 1, 2], "y": ("x", [5, 6, 7]), "z": ("x", [8, 9, 10])},
        data_vars={"a": ("x", [1, 2, 3]), "b": ("x", [3, 5, 6])},
    )
    orig["a"].encoding["coordinates"] = "y"
    orig["b"].encoding["coordinates"] = "z"
    enc, _ = conventions.encode_dataset_coordinates(orig)
    assert enc["a"].attrs["coordinates"] == "y"
    assert enc["b"].attrs["coordinates"] == "z"
    orig["a"].attrs["coordinates"] = "foo"
    with pytest.raises(ValueError, match=r"'coordinates' found in both attrs"):
        conventions.encode_dataset_coordinates(orig)
def test_multidimensional_coordinates(self) -> None:
    # regression test for GH1763
    # Set up test case with coordinates that have overlapping (but not
    # identical) dimensions.
    zeros1 = np.zeros((1, 5, 3))
    zeros2 = np.zeros((1, 6, 3))
    zeros3 = np.zeros((1, 5, 4))
    orig = Dataset(
        {
            "lon1": (["x1", "y1"], zeros1.squeeze(0), {}),
            "lon2": (["x2", "y1"], zeros2.squeeze(0), {}),
            "lon3": (["x1", "y2"], zeros3.squeeze(0), {}),
            "lat1": (["x1", "y1"], zeros1.squeeze(0), {}),
            "lat2": (["x2", "y1"], zeros2.squeeze(0), {}),
            "lat3": (["x1", "y2"], zeros3.squeeze(0), {}),
            "foo1": (["time", "x1", "y1"], zeros1, {"coordinates": "lon1 lat1"}),
            "foo2": (["time", "x2", "y1"], zeros2, {"coordinates": "lon2 lat2"}),
            "foo3": (["time", "x1", "y2"], zeros3, {"coordinates": "lon3 lat3"}),
            "time": ("time", [0.0], {"units": "hours since 2017-01-01"}),
        }
    )
    orig = conventions.decode_cf(orig)
    # Encode the coordinates, as they would be in a netCDF output file.
    enc, attrs = conventions.encode_dataset_coordinates(orig)
    # Make sure we have the right coordinates for each variable.
    foo1_coords = enc["foo1"].attrs.get("coordinates", "")
    foo2_coords = enc["foo2"].attrs.get("coordinates", "")
    foo3_coords = enc["foo3"].attrs.get("coordinates", "")
    assert set(foo1_coords.split()) == {"lat1", "lon1"}
    assert set(foo2_coords.split()) == {"lat2", "lon2"}
    assert set(foo3_coords.split()) == {"lat3", "lon3"}
    # Should not have any global coordinates.
    assert "coordinates" not in attrs
def test_emit_coordinates_attribute_in_encoding(self) -> None:
    orig = Dataset(
        {"a": 1, "b": 1},
        coords={"t": np.array("2004-11-01T00:00:00", dtype=np.datetime64)},
    )

    orig["a"].encoding["coordinates"] = None
    enc, _ = conventions.encode_dataset_coordinates(orig)

    # setting encoding["coordinates"] = None suppresses the attribute for 'a'
    assert "coordinates" not in enc["a"].attrs
    assert "coordinates" not in enc["a"].encoding

    # 'b' was not suppressed, so its coordinates attribute is emitted to attrs
    assert enc["b"].attrs.get("coordinates") == "t"
    assert "coordinates" not in enc["b"].encoding
def test_var_with_coord_attr(self) -> None:
    # regression test for GH6310
    # don't overwrite user-defined "coordinates" attributes
    orig = Dataset(
        {"values": ("time", np.zeros(2), {"coordinates": "time lon lat"})},
        coords={
            "time": ("time", np.zeros(2)),
            "lat": ("time", np.zeros(2)),
            "lon": ("time", np.zeros(2)),
        },
    )
    # Encode the coordinates, as they would be in a netCDF output file.
    enc, attrs = conventions.encode_dataset_coordinates(orig)
    # Make sure we have the right coordinates for each variable.
    values_coords = enc["values"].attrs.get("coordinates", "")
    assert set(values_coords.split()) == {"time", "lat", "lon"}
    # Should not have any global coordinates.
    assert "coordinates" not in attrs
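# Illustrative sketch (not an upstream test): the tests above all exercise the
# same contract, spelled out here once for reference. encode_dataset_coordinates
# returns the dataset's variables, with non-dimension coordinates recorded in
# each variable's "coordinates" attribute, plus the remaining global attributes.
# The dataset contents below are hypothetical.
def test_encode_dataset_coordinates_sketch(self) -> None:
    orig = Dataset(
        {"values": ("time", np.zeros(2))},
        coords={"time": ("time", np.zeros(2)), "lat": ("time", np.zeros(2))},
    )
    enc, attrs = conventions.encode_dataset_coordinates(orig)
    # "lat" is a non-dimension coordinate, so it is listed in the variable's
    # "coordinates" attribute; "time" is a dimension coordinate and is not.
    assert "lat" in enc["values"].attrs.get("coordinates", "").split()
    # Every coordinate is attached to a variable, so nothing lands in the
    # global attributes.
    assert "coordinates" not in attrs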
def _setup_rechunk(
    source,
    target_chunks,
    max_mem,
    target_store,
    target_options=None,
    temp_store=None,
    temp_options=None,
):
    # Temp options default to the target options unless given explicitly
    if temp_options is None:
        temp_options = target_options
    target_options = target_options or {}
    temp_options = temp_options or {}

    if isinstance(source, xarray.Dataset):
        if not isinstance(target_chunks, dict):
            raise ValueError(
                "You must specify ``target_chunks`` as a dict when rechunking a dataset."
            )

        variables, attrs = encode_dataset_coordinates(source)
        attrs = _encode_zarr_attributes(attrs)

        if temp_store:
            temp_group = zarr.group(temp_store)
        else:
            temp_group = None
        target_group = zarr.group(target_store)
        target_group.attrs.update(attrs)

        copy_specs = []
        for name, variable in variables.items():
            # This isn't strictly necessary because a shallow copy
            # also occurs in `encode_dataset_coordinates`, but do it
            # anyway in case the coord encoding function changes
            variable = variable.copy()

            # Update the array encoding with provided options and apply it;
            # note that at this point `options` may contain any valid property
            # applicable to the `encoding` parameter in Dataset.to_zarr
            # other than "chunks"
            options = target_options.get(name, {})
            if "chunks" in options:
                raise ValueError(
                    f"Chunks must be provided in ``target_chunks`` rather than options (variable={name})"
                )
            variable.encoding.update(options)
            variable = encode_zarr_variable(variable)

            # Extract the array encoding to get a default chunking, a step
            # which will also ensure that the target chunking is compatible
            # with the current chunking (only necessary for on-disk arrays)
            variable_encoding = extract_zarr_variable_encoding(
                variable, raise_on_invalid=False, name=name
            )
            variable_chunks = target_chunks.get(name, variable_encoding["chunks"])

            # Restrict options to only those that are specific to zarr and
            # not managed internally
            options = {k: v for k, v in options.items() if k in ZARR_OPTIONS}
            _validate_options(options)

            # Extract array attributes along with reserved property for
            # xarray dimension names
            variable_attrs = _encode_zarr_attributes(variable.attrs)
            variable_attrs[DIMENSION_KEY] = encode_zarr_attr_value(variable.dims)

            copy_spec = _setup_array_rechunk(
                dask.array.asarray(variable),
                variable_chunks,
                max_mem,
                target_group,
                target_options=options,
                temp_store_or_group=temp_group,
                temp_options=options,
                name=name,
            )
            copy_spec.write.array.attrs.update(variable_attrs)  # type: ignore
            copy_specs.append(copy_spec)
        return copy_specs, temp_group, target_group

    elif isinstance(source, zarr.hierarchy.Group):
        if not isinstance(target_chunks, dict):
            raise ValueError(
                "You must specify ``target_chunks`` as a dict when rechunking a group."
            )

        if temp_store:
            temp_group = zarr.group(temp_store)
        else:
            temp_group = None
        target_group = zarr.group(target_store)
        target_group.attrs.update(source.attrs)

        copy_specs = []
        for array_name, array_target_chunks in target_chunks.items():
            copy_spec = _setup_array_rechunk(
                source[array_name],
                array_target_chunks,
                max_mem,
                target_group,
                target_options=target_options.get(array_name),
                temp_store_or_group=temp_group,
                temp_options=temp_options.get(array_name),
                name=array_name,
            )
            copy_specs.append(copy_spec)
        return copy_specs, temp_group, target_group

    elif isinstance(source, (zarr.core.Array, dask.array.Array)):
        copy_spec = _setup_array_rechunk(
            source,
            target_chunks,
            max_mem,
            target_store,
            target_options=target_options,
            temp_store_or_group=temp_store,
            temp_options=temp_options,
        )
        intermediate = copy_spec.intermediate.array
        target = copy_spec.write.array
        return [copy_spec], intermediate, target

    else:
        raise ValueError(
            f"Source must be a Zarr Array, Zarr Group, Dask Array or Xarray Dataset (not {type(source)})."
        )
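# Usage sketch for the plain-array branch above (kept as a comment so it does
# not run on import). It assumes the surrounding module's dependencies (zarr,
# dask, _setup_array_rechunk) are available; the store paths and chunk sizes
# are hypothetical. For a dask (or zarr) array source, _setup_rechunk returns
# a single-element list of copy specs together with the intermediate and
# target arrays:
#
#     import dask.array
#
#     source = dask.array.zeros((1000, 1000), chunks=(10, 1000))
#     copy_specs, intermediate, target = _setup_rechunk(
#         source,
#         target_chunks=(1000, 10),
#         max_mem=256_000_000,
#         target_store="target.zarr",  # hypothetical path
#         temp_store="temp.zarr",  # hypothetical path
#     )
#
# For Dataset and Group sources, ``target_chunks`` must instead be a dict
# keyed by variable/array name, and the returned triple is
# (copy_specs, temp_group, target_group).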