def add_uniform_time_weights(ds):
    """Append uniform time weights to a Dataset.

    All DataArrays with a time coordinate require a time weights coordinate.
    For Datasets read in without a time bounds coordinate or explicit time
    weights built in, aospy adds uniform time weights at each point in the
    time coordinate.

    Parameters
    ----------
    ds : Dataset
        Input data

    Returns
    -------
    Dataset
    """
    time = ds[TIME_STR]
    unit_interval = time.attrs['units'].split('since')[0].strip()
    time_weights = xr.ones_like(time)
    time_weights.attrs['units'] = unit_interval
    del time_weights.attrs['calendar']
    ds[TIME_WEIGHTS_STR] = time_weights
    return ds
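# Minimal usage sketch for the helper above. TIME_STR and TIME_WEIGHTS_STR are
# module-level constants in aospy; the values below are assumptions for this
# sketch, and the dataset uses raw (undecoded) time values so that the CF
# 'units'/'calendar' attributes are present on the time coordinate.
import numpy as np
import xarray as xr

TIME_STR = 'time'
TIME_WEIGHTS_STR = 'time_weights'

time = xr.DataArray(
    np.arange(3), dims=[TIME_STR],
    attrs={'units': 'days since 2000-01-01', 'calendar': 'noleap'},
)
ds = xr.Dataset({'tas': (TIME_STR, [280.0, 281.0, 282.0])},
                coords={TIME_STR: time})

ds = add_uniform_time_weights(ds)
print(ds[TIME_WEIGHTS_STR].attrs['units'])  # -> 'days'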
def _test_data(grid_label="gn", z_axis=True):
    xt = np.arange(4) + 1
    yt = np.arange(5) + 1
    zt = np.arange(6) + 1
    x = xr.DataArray(xt, coords=[("x", xt)])
    y = xr.DataArray(yt, coords=[("y", yt)])
    lev = xr.DataArray(zt, coords=[("lev", zt)])

    # Need to add a tracer here to get the tracer dimsuffix
    coords = [("x", x.data), ("y", y.data)]
    data = np.random.rand(len(xt), len(yt))
    dims = ["x", "y"]
    if z_axis:
        coords.append(("lev", lev.data))
        data = np.random.rand(len(x), len(y), len(lev))
        dims = ["x", "y", "lev"]

    tr = xr.DataArray(
        data,
        dims=dims,
        coords=coords,
    )

    lon_raw = xr.DataArray(xt, coords=[("x", xt)])
    lat_raw = xr.DataArray(yt, coords=[("y", yt)])
    lon = lon_raw * xr.ones_like(lat_raw)
    lat = xr.ones_like(lon_raw) * lat_raw

    lon_bounds_e = lon + 0.5
    lon_bounds_w = lon - 0.5 + (np.random.rand(*lon.shape) * 0.05)
    lat_bounds_n = lat + 0.5 + (np.random.rand(*lon.shape) * 0.05)
    lat_bounds_s = lat - 0.5 + (np.random.rand(*lon.shape) * 0.05)

    # bounds ordered [west, east] and [south, north]
    lon_bounds = xr.concat(
        [_add_small_rand(lon_bounds_w), _add_small_rand(lon_bounds_e)], dim="bnds")
    lat_bounds = xr.concat(
        [_add_small_rand(lat_bounds_s), _add_small_rand(lat_bounds_n)], dim="bnds")

    if z_axis:
        lev_bounds = xr.concat(
            [_add_small_rand(lev - 0.5), _add_small_rand(lev + 0.5)], dim="bnds")

    lon_verticies = xr.concat(
        [
            _add_small_rand(lon_bounds_e),
            _add_small_rand(lon_bounds_e),
            _add_small_rand(lon_bounds_w),
            _add_small_rand(lon_bounds_w),
        ],
        dim="vertex",
    )
    lat_verticies = xr.concat(
        [
            _add_small_rand(lat_bounds_s),
            _add_small_rand(lat_bounds_n),
            _add_small_rand(lat_bounds_n),
            _add_small_rand(lat_bounds_s),
        ],
        dim="vertex",
    )

    ds = xr.Dataset({"base": tr})
    dataset_coords = dict(
        lon=lon,
        lat=lat,
        lon_bounds=lon_bounds,
        lat_bounds=lat_bounds,
        lon_verticies=lon_verticies,
        lat_verticies=lat_verticies,
    )
    if z_axis:
        dataset_coords["lev_bounds"] = lev_bounds

    ds = ds.assign_coords(dataset_coords)
    ds.attrs["source_id"] = "test_model"
    ds.attrs["grid_label"] = grid_label
    ds.attrs["variable_id"] = "base"
    return ds
def run(self):
    # for the `num_cells` operation the field shouldn't be given because the
    # number of cells is just computed from the mask
    inputs = self.input()
    da_objects = inputs["objects"].open()

    if self.op != "num_cells":
        da_field = inputs["field"].open().squeeze()
    else:
        if self.field_name is not None:
            raise Exception(
                f"Field name should not be given when computing `{self.op}`"
                f" (`{self.field_name}` was provided)"
            )
        da_field = None

    object_ids = np.unique(da_objects.chunk(None).values)
    # the first object id is the background value 0, so skip it
    if object_ids[0] == 0:
        object_ids = object_ids[1:]

    kwargs = dict(
        objects=da_objects.name,
        object_ids=object_ids,
        op=self.op,
    )
    if self.op != "num_cells":
        kwargs["scalar"] = da_field.name

    da_objects_ = da_objects.sel(zt=self.z).compute()
    if self.op != "num_cells":
        da_ = da_field.sel(zt=self.z).compute()
    else:
        da_ = xr.ones_like(da_objects_)

    # to avoid the confusion where the "area" is requested but what is in
    # fact returned is the "number of cells" (which is dimensionless), we
    # enforce here that the "area" cannot be calculated; instead "num_cells"
    # can be requested and we use the `area` dask-image op (which returns
    # the number of cells)
    op = kwargs["op"]
    if op == "area":
        raise Exception(
            "Shouldn't ask for `area` as it is actually the number of cells"
        )
    elif op == "num_cells":
        op = "area"

    fn = getattr(dask_image.ndmeasure, op)
    v = fn(da_, label_image=da_objects_, index=object_ids).compute()
    da = xr.DataArray(data=v, dims=["object_id"],
                      coords=dict(object_id=object_ids))

    if self.op != "num_cells":
        da.name = "{}__{}".format(da_.name, kwargs["op"])
        da.attrs["units"] = da_.units
        da.attrs["long_name"] = "{} of {} per object".format(
            kwargs["op"], da_.long_name,
        )
    else:
        da.name = "num_cells"
        da.attrs["units"] = "1"
        da.attrs["long_name"] = "num_cells per object"

    da.coords["zt"] = self.z
    da.coords["time"] = da_objects_.time

    da.to_netcdf(self.output().fn)
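# Standalone sketch of the per-object reduction used above: dask-image's
# ndmeasure functions reduce a field over each labelled region (label 0 is
# treated as background here). Not tied to the luigi task above; assumes
# dask-image is installed, and uses tiny synthetic arrays.
import numpy as np
import xarray as xr
import dask.array
import dask_image.ndmeasure

labels = xr.DataArray(np.array([[0, 1, 1],
                                [2, 2, 0],
                                [2, 0, 0]]), dims=["y", "x"])
field = xr.DataArray(np.arange(9.0).reshape(3, 3), dims=["y", "x"])

object_ids = np.unique(labels.values)
object_ids = object_ids[object_ids != 0]  # drop the background label

# per-object mean of the field; ndmeasure.area would instead count the cells
means = dask_image.ndmeasure.mean(
    dask.array.from_array(field.values),
    label_image=dask.array.from_array(labels.values),
    index=object_ids,
).compute()

per_object = xr.DataArray(means, dims=["object_id"],
                          coords={"object_id": object_ids})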
def test_ds_to_np( self, tmp_path, normalize, to_tensor, experiment, surrounding_pixels, predict_delta, ): x_pred, _, _ = _make_dataset(size=(5, 5), const=True) x_coeff1, _, _ = _make_dataset(size=(5, 5), variable_name="precip") x_coeff2, _, _ = _make_dataset(size=(5, 5), variable_name="soil_moisture") x_coeff3, _, _ = _make_dataset(size=(5, 5), variable_name="temp") x = xr.merge([x_pred, x_coeff1, x_coeff2, x_coeff3]) y = x_pred.isel(time=[0]) data_dir = tmp_path / experiment / "1980_1" if not data_dir.exists(): data_dir.mkdir(parents=True, exist_ok=True) x.to_netcdf(data_dir / "x.nc") y.to_netcdf(data_dir / "y.nc") norm_dict = {} for var in x.data_vars: norm_dict[var] = { "mean": float(x[var].mean(dim=["lat", "lon", "time"], skipna=True).values), # we clip the std because since constant=True, the std=0 for VHI, # giving NaNs which mess the tests up "std": float( np.clip( a=x[var].std(dim=["lat", "lon", "time"], skipna=True).values, a_min=1, a_max=None, )), } # build static data static1 = x.mean(dim="time").rename( {v: f"{v}_pixel_mean" for v in x.data_vars}) ones = xr.ones_like(x.mean(dim="time"))[[v for v in x.data_vars][0]] static2 = x.mean(dim=["lat", "lon", "time"]).rename( {v: f"{v}_global_mean" for v in x.data_vars}) static2 = static2 * ones static_ds = xr.auto_combine([static1, static2]) class MockLoader: def __init__(self): self.batch_file_size = None self.mode = None self.shuffle = None self.clear_nans = None self.data_files = [] self.normalizing_dict = norm_dict if normalize else None self.to_tensor = None self.experiment = experiment self.surrounding_pixels = surrounding_pixels self.predict_delta = predict_delta self.ignore_vars = ["precip"] self.monthly_aggs = False self.device = torch.device("cpu") self.incl_yearly_aggs = False self.static = static_ds self.spatial_mask = None self.static_normalizing_dict = None self.normalize_y = normalize base_iterator = _BaseIter(MockLoader()) arrays = base_iterator.ds_folder_to_np(data_dir, to_tensor=to_tensor) x_train_data, y_np, latlons = (arrays.x, arrays.y, arrays.latlons) # ---------------------- # Test the static data # ---------------------- # check first 3 features are CONSTANT (global means) assert all([ all(arrays.x.static[:, i][1:] == arrays.x.static[:, i][:-1]) for i in range(4) ]) if not predict_delta: # check second 3 features vary (pixel means) assert all([ all(arrays.x.static[:, i][1:] != arrays.x.static[:, i][:-1]) for i in range(4, 6) ]), (f"static data: \n[,4]\n: {arrays.x.static[:, 4][1:]}\n[,5]" f"\n: {arrays.x.static[:, 5][1:]}") n_samples = 25 if surrounding_pixels is None else 9 assert (arrays.x.static.shape[0] == n_samples ), f"Expect {n_samples} samples because ..." assert ( arrays.x.static.shape[-1] == 6 ), "Expect 6 static features because ignore 'precip' variables in the static data" # ---------------------- # Test the TrainData # ---------------------- assert isinstance(x_train_data, TrainData) if not to_tensor: assert isinstance(y_np, np.ndarray) expected_features = 3 if surrounding_pixels is None else 3 * 9 assert x_train_data.historical.shape[-1] == expected_features, ( f"There should be 4 historical variables " f"(the final dimension): {x_train_data.historical.shape}") if experiment == "nowcast": expected_shape = (25, 2) if surrounding_pixels is None else (9, 2 * 9) assert x_train_data.current.shape == expected_shape, ( f"Expecting multiple vars in the current timestep. 
" f"Expect: (25, 5) Got: {x_train_data.current.shape}") expected_latlons = 25 if surrounding_pixels is None else 9 assert latlons.shape == (expected_latlons, 2), ( "The shape of " "latlons should not change" f"Got: {latlons.shape}. Expecting: (25, 2)") assert x_train_data.latlons.shape == (expected_latlons, 2), ( "The shape of " "latlons should not change" f"Got: {latlons.shape}. Expecting: (25, 2)") if normalize and (experiment == "nowcast") and (not to_tensor): assert x_train_data.current.max() < 6, ( f"The current data should be" f" normalized. Currently: {x_train_data.current.flatten()}") if to_tensor: assert (type(x_train_data.historical) == torch.Tensor) and (type(y_np) == torch.Tensor) else: assert (type(x_train_data.historical) == np.ndarray) and (type(y_np) == np.ndarray) if (not normalize) and (experiment == "nowcast") and (not to_tensor): assert x_train_data.historical.shape[ 0] == x_train_data.current.shape[0], ( "The 0th dimension (latlons) should be equal in the " f"historical ({x_train_data.historical.shape[0]}) and " f"current ({x_train_data.current.shape[0]}) arrays.") expected = (x[["soil_moisture", "temp"]].sel(time=y.time).stack( dims=["lat", "lon"]).to_array().values.T[:, 0, :]) got = x_train_data.current if surrounding_pixels is None: assert expected.shape == got.shape, ( "should have stacked latlon" " vars as the first dimension in the current array.") assert (expected == got).all(), ( "" "Expected to find the target timesetep of `precip` values" "(the non-target variable for the target timestep: " f"({pd.to_datetime(y.time.values).strftime('%Y-%m-%d')[0]})." f"Expected: {expected[:5]}. \nGot: {got[:5]}") for idx in range(latlons.shape[0]): lat, lon = latlons[idx, 0], latlons[idx, 1] for time in range(x_train_data.historical.shape[1]): target = x.isel(time=time).sel(lat=lat).sel(lon=lon).VHI.values if (not normalize) and (not to_tensor): assert target == x_train_data.historical[idx, time, 0], ( "Got different x values for time idx:" f"{time}, lat: {lat}, lon: {lon} Expected {target}, " f"got {x_train_data.historical[idx, time, 0]}") if ((not normalize) and (experiment == "nowcast") and (surrounding_pixels is None)): # test that we are getting the right `current` data relevant_features = ["soil_moisture", "temp"] target_time = y.time expected = ( x[relevant_features] # all vars except target_var and the ignored var .sel(time=target_time) # select the target_time .stack(dims=[ "lat", "lon" ]) # stack lat,lon so shape = (lat*lon, time, dims) .to_array().values[:, 0, :]. T # extract numpy array, transpose and drop dim ) assert np.all(x_train_data.current == expected), ( f"Expected to " "find the target_time data for the non target variables") if x_train_data.yearly_aggs is not None: # n_variables should be 3 because `ignoring` precip assert x_train_data.yearly_aggs.shape[1] == 3 if (not normalize) and (not to_tensor): mean_temp = x_coeff3.temp.mean(dim=["time", "lat", "lon"]).values if x_train_data.yearly_aggs is not None: assert (mean_temp == x_train_data.yearly_aggs).any() if predict_delta: assert (y_np == 0).all( ), "The derivatives should be 0 for a constant input." assert (base_iterator.predict_delta ), "should have set model_ derivative to True"
def arct_connect(ds, varName, all_faces): arc_cap = 6 Nx_ac_nrot = [] Ny_ac_nrot = [] Nx_ac_rot = [] Ny_ac_rot = [] ARCT = [] arc_faces = [] metrics = ["dxC", "dyC", "dxG", "dyG"] if arc_cap in all_faces: for k in all_faces: if k == 2: fac = 1 arc_faces.append(k) _varName = varName DIMS = [dim for dim in ds[_varName].dims if dim != "face"] dims = Dims(DIMS[::-1]) dtr = list(dims)[::-1] dtr[-1], dtr[-2] = dtr[-2], dtr[-1] mask2 = _xr.ones_like(ds[_varName].isel(face=arc_cap)) # TODO: Eval where, define argument outside mask2 = mask2.where( _np.logical_and( ds[dims.X] < ds[dims.Y], ds[dims.X] < len(ds[dims.Y]) - ds[dims.Y], )) x0, xf = 0, int(len(ds[dims.Y]) / 2) # TODO: CHECK here! y0, yf = 0, int(len(ds[dims.X])) xslice = slice(x0, xf) yslice = slice(y0, yf) Nx_ac_nrot.append(0) Ny_ac_nrot.append(len(ds[dims.Y][y0:yf])) da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice} sort_arg = {"variables": dims.Y, "ascending": False} mask_arg = {dims.X: xslice, dims.Y: yslice} if len(dims.X) + len(dims.Y) == 4: if len(dims.Y) == 1 and _varName not in metrics: fac = -1 if "mates" in list(ds[_varName].attrs): _varName = ds[_varName].attrs["mates"] _DIMS = [dim for dim in ds[_varName].dims if dim != "face"] dims = Dims(_DIMS[::-1]) dtr = list(dims)[::-1] dtr[-1], dtr[-2] = dtr[-2], dtr[-1] mask2 = _xr.ones_like(ds[_varName].isel(face=arc_cap)) mask2 = mask2.where( _np.logical_and( ds[dims.X] < ds[dims.Y], ds[dims.X] < len(ds[dims.Y]) - ds[dims.Y], )) da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice} sort_arg = {"variables": dims.Y, "ascending": False} mask_arg = {dims.X: xslice, dims.Y: yslice} arct = fac * ds[_varName].isel(**da_arg) arct = arct.sortby(**sort_arg) Mask = mask2.isel(**mask_arg) Mask = Mask.sortby(**sort_arg) arct = (arct * Mask).transpose(*dtr) ARCT.append(arct) elif k == 5: fac = 1 arc_faces.append(k) _varName = varName DIMS = [dim for dim in ds[_varName].dims if dim != "face"] dims = Dims(DIMS[::-1]) mask5 = _xr.ones_like(ds[_varName].isel(face=arc_cap)) mask5 = mask5.where( _np.logical_and( ds[dims.X] > ds[dims.Y], ds[dims.X] < len(ds[dims.Y]) - ds[dims.Y], )) x0, xf = 0, int(len(ds[dims.X])) y0, yf = 0, int(len(ds[dims.Y]) / 2) xslice = slice(x0, xf) yslice = slice(y0, yf) Nx_ac_nrot.append(0) Ny_ac_nrot.append(len(ds[dims.X][y0:yf])) da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice} mask_arg = {dims.X: xslice, dims.Y: yslice} arct = ds[_varName].isel(**da_arg) Mask = mask5.isel(**mask_arg) arct = arct * Mask ARCT.append(arct) elif k == 7: fac = 1 arc_faces.append(k) _varName = varName DIMS = [dim for dim in ds[_varName].dims if dim != "face"] dims = Dims(DIMS[::-1]) dtr = list(dims)[::-1] dtr[-1], dtr[-2] = dtr[-2], dtr[-1] mask7 = _xr.ones_like(ds[_varName].isel(face=arc_cap)) mask7 = mask7.where( _np.logical_and( ds[dims.X] > ds[dims.Y], ds[dims.X] > len(ds[dims.Y]) - ds[dims.Y], )) x0, xf = int(len(ds[dims.Y]) / 2), int(len(ds[dims.Y])) y0, yf = 0, int(len(ds[dims.X])) xslice = slice(x0, xf) yslice = slice(y0, yf) Nx_ac_rot.append(len(ds[dims.Y][x0:xf])) Ny_ac_rot.append(0) if len(dims.X) + len(dims.Y) == 4: if len(dims.X) == 1 and _varName not in metrics: fac = -1 if "mates" in list(ds[varName].attrs): _varName = ds[varName].attrs["mates"] DIMS = [dim for dim in ds[_varName].dims if dim != "face"] dims = Dims(DIMS[::-1]) dtr = list(dims)[::-1] dtr[-1], dtr[-2] = dtr[-2], dtr[-1] mask7 = _xr.ones_like(ds[_varName].isel(face=arc_cap)) mask7 = mask7.where( _np.logical_and( ds[dims.X] > ds[dims.Y], ds[dims.X] > len(ds[dims.Y]) - ds[dims.Y], )) da_arg = 
{"face": arc_cap, dims.X: xslice, dims.Y: yslice} mask_arg = {dims.X: xslice, dims.Y: yslice} arct = fac * ds[_varName].isel(**da_arg) Mask = mask7.isel(**mask_arg) arct = (arct * Mask).transpose(*dtr) ARCT.append(arct) elif k == 10: fac = 1 _varName = varName DIMS = [dim for dim in ds[_varName].dims if dim != "face"] dims = Dims(DIMS[::-1]) arc_faces.append(k) mask10 = _xr.ones_like(ds[_varName].isel(face=arc_cap)) mask10 = mask10.where( _np.logical_and( ds[dims.X] < ds[dims.Y], ds[dims.X] > len(ds[dims.Y]) - ds[dims.Y], )) x0, xf = 0, int(len(ds[dims.X])) y0, yf = int(len(ds[dims.Y]) / 2), int(len(ds[dims.Y])) xslice = slice(x0, xf) yslice = slice(y0, yf) Nx_ac_rot.append(0) Ny_ac_rot.append(len(ds[dims.Y][y0:yf])) if len(dims.X) + len(dims.Y) == 4: if _varName not in metrics: fac = -1 da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice} sort_arg = {"variables": [dims.X], "ascending": False} mask_arg = {dims.X: xslice, dims.Y: yslice} arct = fac * ds[_varName].isel(**da_arg) arct = arct.sortby(**sort_arg) Mask = mask10.isel(**mask_arg) Mask = Mask.sortby(**sort_arg) arct = arct * Mask ARCT.append(arct) return arc_faces, Nx_ac_nrot, Ny_ac_nrot, Nx_ac_rot, Ny_ac_rot, ARCT
df.name = name
df = df.reset_index().rename(columns={0: "year", "level_1": "month"})

# create datetime index
df["time"] = df.apply(lambda x: pd.to_datetime(f"{x.year}-{x.month}"), axis=1)
df = df.set_index("time").drop(columns=["year", "month"])

# replace missing data
df = df.astype({name: float}).replace(-99.99, np.nan)

# resample to month end (same as other data)
df = df.resample("M").first()

# -----------------
# save to xarray / .nc
# -----------------
data_dir = Path("data")
vci = xr.open_dataset(
    data_dir / "interim/boku_ndvi_1000_preprocessed/data_kenya.nc")["boku_VCI"]

# for each MONTH TIMESTEP multiply by the nino value
nino_xr = xr.ones_like(vci)
nino_ts = df.loc[nino_xr.time.values]
nino_xr = nino_xr * pd.DataFrame.to_xarray(nino_ts)

if not (data_dir / "analysis/sst").exists():
    (data_dir / "analysis/sst").mkdir(parents=True, exist_ok=True)

# save to netcdf
nino_xr.to_netcdf(data_dir / f"analysis/sst/data_{name}.nc")
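# The step above broadcasts a 1-D index time series onto every grid cell by
# multiplying it with a ones_like template. The same pattern on synthetic
# data (the names below are illustrative, not the project's files):
import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range("2000-01-31", periods=4, freq="M")
grid = xr.DataArray(
    np.random.rand(4, 3, 3),
    dims=["time", "lat", "lon"],
    coords={"time": times, "lat": [0.0, 1.0, 2.0], "lon": [10.0, 11.0, 12.0]},
    name="template",
)

# a monthly climate index on the same timestamps (e.g. an ENSO index)
index_ts = pd.Series([0.5, -0.2, 1.1, 0.3], index=times, name="nino34")
index_ts.index.name = "time"

# ones_like gives a field of ones on the grid; multiplying by the series
# (converted to xarray) broadcasts each monthly value to every pixel
index_grid = xr.ones_like(grid) * index_ts.to_xarray()
index_grid.name = "nino34"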
def simulate_and_regress( pop, no_policy_growth_rate, p_effects, p_lags, p_start_interval, n_days, tsteps_per_day, n_samples, LHS_vars, reg_lag_days, gamma_to_test, min_cases, sigma_to_test=[np.nan], measurement_noise_on=False, measurement_noise_sd=0, beta_noise_on=False, beta_noise_sd=0, sigma_noise_on=False, sigma_noise_sd=0, gamma_noise_on=False, gamma_noise_sd=0, kind="SEIR", E0=1, I0=0, R0=0, random_end=False, ordered_policies=True, save_dir=None, ): """Full wrapper to run Monte Carlo simulations of a disease outbreak using SEIR or SIR dynamics for a number of parameter sets. Parameters ---------- pop : int Population to use for these sets of simulation no_policy_growth_rate : float Continuous (asymptotic if SEIR) daily growth rate of infections given no policy, prior to adding noise. p_effects : list of float Magnitude of the effect of each policy being applied in the simulations to the (asymptotic) continuous growth rate p_lags : list of list of float Lagged policy effect in *dynamic simulation*. This occurs if you assume behavior takes some time to respond to a policy, and thus the affect of the policy on $\beta$ is delayed. The outer dimension of this list refers to the different policies in the simulation. The inner is a list of floats between 0 and 1, with each element corresponding to one day after the policy. For isntance, the value ``[[.5,.75],[]]`` means that there are 2 policies. The first has .5 of it's total effect (as indicated in `p_effects`) occuring the day the policy is implemented, .75 the day after, and then reaches its full effect on $\beta$ on the third day. The second policy has an immediate, full effect on $\beta$ p_start_interval : list of int ``[start_date, end_date]`` providing the bounds within which each policy is allowed to begin. The start date is applied uniformly randomly within this interval for each Monte Carlo draw. n_days : int How many days to run in the simulation tsteps_per_day : int n_samples : int Number of MC draws per parameter set being simulated. In each draw, there will be different noise added to parameters and measurements, different starting days for policies, and/or different end days for the data used in regression LHS_vars : list of str The left hand side variables you want to use to estimate regressions of impact of policy on outbreak growth rate. These are in "SEIR" terminology, i.e. ``I`` is active infectious cases. Sums are accomplished by concatenating letters (e.g. ``IR`` is active infectious cases + recovered/dead cases). reg_lag_days : list of int, optional Lags to include in the regression model for each policy (in days). [0] means no lags. [gamma,sigma]_to_test : list of float Rate parameters to test. Each one will get `n_samples` MC draws. These are defined as continous daily rates min_cases : int Minimum cumulative cases that are needed before the beginning of the timeseries that gets used in regression. However, when this is violated, the regression simply begins 2 days before the first policy. If this happens in many MC draws, this will bias the estimate of no-policy growth rate more than we would see in the regressions on actual data, because it will be further from its asymptotic steady-state growth rate. [measurement,beta,gamma,sigma]_noise_on : "exponential", "normal", or False What type of noise to apply to each parameter during the dynamic simulation and/or to the daily measurements of log-difference (just prior to regression). 
False means no noise [measurement,beta,gamma,sigma]_noise_sd : float Standard deviation of noise to apply. Only used if ``[varname]_noise_on=="normal"`` kind : "SIR" or "SEIR" E0,I0,R0 : int, optional Initial conditions (in number of people). Default is ``I0=1`` and all others 0. random_end : bool, optional Whether to allow each regression to end at the end of the data sample (False) or to randomly cut off the end of the timeseries at some day between the day after the start of the last policy and the end of the sample. Default False. ordered_policies : bool, optional Whether you want the first policy to always be enacted before the second, which is enacted before the third, etc. Default is yes. save_dir : str or :class:`pathlib.Path` The directory to save results Returns ------- daily_ds : :class:`xarray.Dataset` A dataset with all relevant information from each MC draw, both dynamically simulated states and regression outputs. """ attrs = dict( E0=E0, I0=I0, R0=R0, pop=pop, min_cases=min_cases, measurement_noise_on=str(measurement_noise_on), beta_noise_on=str(beta_noise_on), gamma_noise_on=str(gamma_noise_on), measurement_noise_sd=measurement_noise_sd, beta_noise_sd=beta_noise_sd, gamma_noise_sd=gamma_noise_sd, no_policy_growth_rate=no_policy_growth_rate, tsteps_per_day=tsteps_per_day, p_effects=p_effects, ) if save_dir is not None: save_dir = Path(save_dir) E0 = E0 / pop I0 = I0 / pop R0 = R0 / pop ## setup if kind == "SEIR": attrs["sigma_noise_on"] = str(sigma_noise_on) attrs["sigma_noise_sd"] = sigma_noise_sd sim_engine = run_SEIR get_beta = get_beta_SEIR ics = [E0, I0, R0] elif kind == "SIR": sigma_to_test = [np.nan] sigma_noise_on = False sim_engine = run_SIR get_beta = get_beta_SIR ics = [I0, R0] LHS_vars = [l for l in LHS_vars if "E" not in l] # get time vector ttotal = n_days * tsteps_per_day + 1 t = np.linspace(0, 1, ttotal) * n_days # store policy info policies = xr.Dataset( coords={ "policy": [f"p{i+1}" for i in range(len(p_effects))], "time": ["start", "end"], "lag_num": range(len(p_lags[0])), }, data_vars={ "effect": (("policy", ), p_effects), "lag": (("policy", "lag_num"), p_lags), "interval": (("time", ), p_start_interval), }, ) # initialize results array estimates_ds = init_reg_ds( n_samples, LHS_vars, policies.policy.values, gamma=gamma_to_test, sigma=sigma_to_test, ) # get policy effects policy_dummies, random_end_da = init_policy_dummies( policies, n_samples, t, seed=0, random_end=random_end, ordered_policies=ordered_policies, ) policies = xr.merge((policies, policy_dummies, random_end_da)) policy_effect_timeseries = (policies.policy_timeseries * policies.effect).sum("policy") n_samp_valid = len(policies.sample) # adjust rate params to correct timestep estimates_ds = adjust_timescales_from_daily(estimates_ds, t[1] - t[0]) beta_noise_sd = beta_noise_sd / np.sqrt(tsteps_per_day) gamma_noise_sd = gamma_noise_sd / np.sqrt(tsteps_per_day) sigma_noise_sd = sigma_noise_sd / np.sqrt(tsteps_per_day) # get stochastic params estimates_ds = get_stochastic_discrete_params( estimates_ds, no_policy_growth_rate, policy_effect_timeseries, t, beta_noise_on, beta_noise_sd, kind=kind, gamma_noise_on=gamma_noise_on, gamma_noise_sd=gamma_noise_sd, sigma_noise_on=sigma_noise_on, sigma_noise_sd=sigma_noise_sd, ) # run simulation estimates_ds = sim_engine(*ics, estimates_ds) # add on other potentially observable quantities estimates_ds["IR"] = estimates_ds["R"] + estimates_ds["I"] if kind == "SEIR": estimates_ds["EI"] = estimates_ds["E"] + estimates_ds["I"] estimates_ds["EIR"] = 
estimates_ds["EI"] + estimates_ds["R"] # get minimum S for each simulation # at end and when the last policy turns on estimates_ds["S_min"] = estimates_ds.S.isel(t=-1) p3_on = (policies.policy_timeseries > 0).argmax(dim="t").max(dim="policy") estimates_ds["S_min_p3"] = estimates_ds.S.isel(t=p3_on) # blend in policy dataset estimates_ds = estimates_ds.merge(policies) # convert to daily observations daily_ds = adjust_timescales_to_daily(estimates_ds) # prep regression LHS vars (logdiff) daily_ds["logdiff"] = (np.log(daily_ds[daily_ds.LHS.values]).diff( dim="t", n=1, label="lower").pad(t=(0, 1)).to_array(dim="LHS")) if "sigma" not in daily_ds.logdiff.dims: daily_ds["logdiff"] = daily_ds.logdiff.expand_dims("sigma") # add noise daily_ds = add_obs_noise( daily_ds, measurement_noise_on=measurement_noise_on, measurement_noise_sd=measurement_noise_sd, ) ## run regressions estimates = np.empty( ( len(daily_ds.gamma), len(daily_ds.sigma), len(daily_ds.sample), len(daily_ds.LHS), len(daily_ds.policy) * len(reg_lag_days) + 1, ), dtype=np.float32, ) mses = np.empty( ( len(daily_ds.gamma), len(daily_ds.sigma), len(daily_ds.sample), len(daily_ds.LHS), ), dtype=np.float32, ) estimates.fill(np.nan) mses.fill(np.nan) # add on lags RHS_old = (daily_ds.policy_timeseries > 0).astype(int) RHS_ds = xr.ones_like(RHS_old.isel(policy=0)) RHS_ds["policy"] = "Intercept" for l in reg_lag_days: lag_vars = RHS_old.shift(t=l, fill_value=0) lag_vars["policy"] = [f"{x}_lag{l}" for x in RHS_old.policy.values] RHS_ds = xr.concat((RHS_ds, lag_vars), dim="policy") # Apply min cum_cases threshold used in regressions valid_reg = daily_ds.IR >= min_cases / pop if "sigma" not in valid_reg.dims: valid_reg = valid_reg.expand_dims("sigma") valid_reg["sigma"] = [np.nan] # only run regression on planned start day if we have at least one "no-policy" day after that # otherwise, start regression 2 days before first policy any_pol = (RHS_old > 0).max(dim="policy") first_pol = any_pol.argmax(dim="t") no_pol_on_regday0 = first_pol > valid_reg.argmax(dim="t") backup = any_pol.shift({"t": -2}).astype(bool) backup = backup | backup.isnull() # find random last day to end regression, starting with 1 day after last policy # is implemented if random_end: last_pol = (daily_ds.policy_timeseries.sum(dim="policy") == 3).argmax( dim="t") last_reg_day = (( (daily_ds.dims["t"] - (last_pol + 1)) * daily_ds.random_end).round().astype(int) + last_pol + 1) else: last_reg_day = daily_ds.dims["t"] daily_ds["random_end"] = last_reg_day # loop through regressions for cx, case_var in enumerate(daily_ds.LHS.values): case_ds = daily_ds.logdiff_stoch.sel(LHS=case_var) for gx, g in enumerate(daily_ds.gamma.values): g_ds = case_ds.sel(gamma=g) for sx, s in enumerate(daily_ds.sigma.values): s_ds = g_ds.sel(sigma=s) for samp in daily_ds.sample.values: if no_pol_on_regday0.isel(sample=samp, gamma=gx, sigma=sx): this_valid = valid_reg.isel(sample=samp, gamma=gx, sigma=sx) else: this_valid = backup.isel(sample=samp) if random_end: this_valid = (this_valid) & (RHS_ds.t <= last_reg_day[samp]) LHS = s_ds.isel(sample=samp)[this_valid].values RHS = add_constant( RHS_ds.isel(sample=samp)[{ "t": this_valid }].values) res = OLS(LHS, RHS, missing="drop").fit() estimates[gx, sx, samp, cx] = res.params mses[gx, sx, samp, cx] = res.mse_resid coords = OrderedDict( gamma=daily_ds.gamma, sigma=daily_ds.sigma, sample=daily_ds.sample, LHS=daily_ds.LHS, ) rmse_ds = xr.DataArray(np.sqrt(mses), coords=coords, dims=coords.keys()) coords["policy"] = RHS_ds.policy e = xr.DataArray(estimates, 
coords=coords, dims=coords.keys()).to_dataset("policy") coeffs = [] for p in daily_ds.policy.values: keys = [i for i in e.variables.keys() if f"{p}_" in i] coeffs.append(e[keys].rename( {k: int(k.split("_")[-1][3:]) for k in keys}).to_array(dim="reg_lag")) coef_ds = xr.concat(coeffs, dim="policy") coef_ds.name = "coefficient" daily_ds = daily_ds.drop("coefficient").merge(coef_ds) daily_ds["Intercept"] = e["Intercept"] daily_ds["rmse"] = rmse_ds # add model params daily_ds.attrs = attrs if save_dir is not None: save_dir.mkdir(exist_ok=True, parents=True) fname = f"pop_{int(pop)}_lag_{'-'.join([str(s) for s in reg_lag_days])}.nc" daily_ds.to_netcdf(save_dir / fname) return daily_ds
from python.aux.floodmodels import LocalModel, FlowModel

# In[45]:

# ### Spatial feature selection

# In contrast to the transport model we have no background info on which
# gridpoints influence the predictand the most, so we use a
# dimensionality-reduction approach and/or let the model decide which
# gridpoints are most relevant.
#
# The only hard constraints for the LocalModel are the influence radius of
# 1.5 degrees latitude/longitude (about 170 km) and that the gridpoints have
# to lie within the catchment basin of the point.

# In[46]:

from python.aux.utils_flowmodel import get_mask_of_basin

# In[47]:

map = xr.ones_like(glofas['dis'].isel(time=0).drop('time'))
mask_catchment = get_mask_of_basin(map, 'Danube')

if debug:
    plt.imshow(mask_catchment.astype(int))
    plt.title('Catchment basin of the Danube river')
    plt.show()

# In[48]:

def select_riverpoints(dis):
    return (dis > 10)  # .drop('time')

# In[49]:
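# A self-contained sketch of the masking pattern in the cells above. The
# bounding box below is a purely illustrative stand-in for get_mask_of_basin
# (which reads real basin outlines); the field and coordinates are synthetic.
import numpy as np
import xarray as xr

lat = np.arange(44.0, 51.0, 0.5)
lon = np.arange(8.0, 20.0, 0.5)
dis = xr.DataArray(
    np.random.gamma(2.0, 10.0, size=(len(lat), len(lon))),
    dims=["latitude", "longitude"],
    coords={"latitude": lat, "longitude": lon},
    name="dis",
)

# template of ones on the same grid, as in the notebook cell above
template = xr.ones_like(dis)

# stand-in "basin" mask: a lat/lon box instead of a real catchment outline
mask_catchment = (
    (template.latitude > 46) & (template.latitude < 50)
    & (template.longitude > 9) & (template.longitude < 19)
)

# keep only gridpoints inside the basin that carry meaningful discharge
riverpoints = dis.where(mask_catchment & (dis > 10))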
def _wrap_butterworth(data, coord, freq, kind, cycles_per="s", order=2, debug=False, gappy=None, **kwargs): """ Inputs ------ data : xr.DataArray coord : coordinate along which to filter freq : "frequencies" for filtering cycles_per: optional Units for frequency order : optional Butterworth filter order kwargs : dict, optional passed down to gappy_filter Outputs ------- filtered : xr.DataArray """ # if len(data.dims) > 1 and coord is None: # raise ValueError('Specify coordinate along which to filter') # else: # coord = data.coords[0] if _is_datetime_like(data[coord]): dx = _process_time(data[coord], cycles_per) else: dx = np.diff(data[coord][0:2].values) b, a = signal.butter(order, freq * dx / (1 / 2), btype=kind) data = data.copy().transpose(..., coord) if debug: import dcpy.ts import matplotlib.pyplot as plt f, ax = plt.subplots(2, 1, constrained_layout=True) data.plot(x=coord, ax=ax[0]) dcpy.ts.PlotSpectrum(data, cycles_per=cycles_per, ax=ax[1]) if data.chunks: chunks = dict(zip(data.dims, data.chunks)) if len(chunks[coord]) > 1: use_overlap = True else: use_overlap = False else: use_overlap = False if gappy is not None: warnings.warn( UserWarning, "'gappy' kwarg is now deprecated and completely ignored.") num_discard = kwargs.pop("num_discard", "auto") kwargs.setdefault("method", "gust") if kwargs["method"] == "gust" and "irlen" not in kwargs: kwargs["irlen"] = estimate_impulse_response_len(b, a) kwargs.update(b=b, a=a, axis=-1) valid = data.notnull() if np.issubdtype(data.dtype, np.dtype(complex)): filled = data.real.ffill(coord).bfill( coord) + 1j * data.imag.ffill(coord).bfill(coord) else: filled = data.ffill(coord).bfill(coord) # I need distance from nearest NaN index = np.arange(data.sizes[coord]) arange = xr.ones_like(data.reset_coords(drop=True), dtype=int) * index invalid_arange = ( arange.where(~valid).interpolate_na( coord, "nearest", fill_value="extrapolate").fillna(-1) # when all points are valid ) distance = np.abs(arange - invalid_arange).where(valid) if not use_overlap: filtered = xr.apply_ufunc( filter_, filled, input_core_dims=[[coord]], output_core_dims=[[coord]], dask="parallelized", output_dtypes=[data.dtype], kwargs=kwargs, ) else: import dask if not isinstance(data, xr.DataArray): raise ValueError("map_overlap implemented only for DataArrays.") irlen = estimate_impulse_response_len(b, a) axis = data.get_axis_num(coord) overlap = np.round(2 * irlen).astype(int) min_chunksize = 3 * overlap actual_chunksize = data.data.chunksize[axis] if actual_chunksize < min_chunksize: raise ValueError( f"Chunksize along {coord} = {actual_chunksize} < {min_chunksize}. Please rechunk" ) depth = dict(zip(range(data.ndim), [0] * data.ndim)) depth[data.ndim - 1] = overlap filtered = data.copy(data=dask.array.map_overlap( filled.data, filter_, depth=depth, boundary="none", meta=filled.data._meta, **kwargs, )) # take out the beginning and end if necessary mask = xr.DataArray( np.ones((filtered.sizes[coord], ), dtype=bool), dims=[coord], name=coord, coords={coord: filtered[coord]}, ) num_discard = _get_num_discard(kwargs, num_discard) if num_discard > 0: mask[:num_discard] = False mask[-num_discard:] = False filtered = filtered.where((distance >= num_discard) & mask) if debug: filtered.plot(x=coord, ax=ax[0]) ylim = ax[1].get_ylim() dcpy.ts.PlotSpectrum(filtered, cycles_per=cycles_per, ax=ax[1]) ax[1].set_ylim(ylim) for ff in np.array(freq, ndmin=1): plt.axvline(ff) return filtered
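# The wrapper above ultimately applies a SciPy Butterworth filter along one
# dimension. A minimal standalone version of that core step (no gap handling,
# no dask, evenly spaced samples assumed; the series below is synthetic):
import numpy as np
import xarray as xr
from scipy import signal

t = np.arange(240)  # hourly samples
data = xr.DataArray(
    np.sin(2 * np.pi * t / 72) + 0.3 * np.random.randn(t.size),
    dims=["time"],
    coords={"time": t},
)

dx = 1.0        # sample spacing (hours)
freq = 1 / 24   # cutoff frequency (cycles per hour): keep periods > 1 day
order = 2

# normalize the cutoff by the Nyquist frequency 1 / (2 * dx), as above
b, a = signal.butter(order, freq * dx / (1 / 2), btype="low")
lowpass = data.copy(data=signal.filtfilt(b, a, data.values))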
def init_z_level_vertical_coord(config, ds): """ Create a z-level vertical coordinate based on the config options in the ``vertical_grid`` section and the ``bottomDepth`` and ``ssh`` variables of the mesh data set. The following new variables will be added to the data set: * ``minLevelCell`` - the index of the top valid layer * ``maxLevelCell`` - the index of the bottom valid layer * ``cellMask`` - a mask of where cells are valid * ``layerThickness`` - the thickness of each layer * ``restingThickness`` - the thickness of each layer stretched as if ``ssh = 0`` * ``zMid`` - the elevation of the midpoint of each layer So far, all supported coordinates make use of a 1D reference vertical grid. The following variables associated with that field are also added to the mesh: * ``refTopDepth`` - the positive-down depth of the top of each ref. level * ``refZMid`` - the positive-down depth of the middle of each ref. level * ``refBottomDepth`` - the positive-down depth of the bottom of each ref. level * ``refInterfaces`` - the positive-down depth of the interfaces between ref. levels (with ``nVertLevels`` + 1 elements). * ``vertCoordMovementWeights`` - the weights (all ones) for coordinate movement There is considerable redundancy between these variables but each is sometimes convenient. Parameters ---------- config : configparser.ConfigParser Configuration options with parameters used to construct the vertical grid ds : xarray.Dataset A data set containing ``bottomDepth`` and ``ssh`` variables used to construct the vertical coordinate """ add_1d_grid(config, ds) ds['vertCoordMovementWeights'] = xarray.ones_like(ds.refBottomDepth) ds['minLevelCell'], ds['maxLevelCell'], ds['cellMask'] = \ compute_min_max_level_cell(ds.refTopDepth, ds.refBottomDepth, ds.ssh, ds.bottomDepth) ds['bottomDepth'], ds['maxLevelCell'] = alter_bottom_depth( config, ds.bottomDepth, ds.refBottomDepth, ds.maxLevelCell) ds['ssh'], ds['minLevelCell'] = alter_ssh(config, ds.ssh, ds.refBottomDepth, ds.minLevelCell) ds['layerThickness'] = compute_z_level_layer_thickness( ds.refTopDepth, ds.refBottomDepth, ds.ssh, ds.bottomDepth, ds.minLevelCell, ds.maxLevelCell) ds['restingThickness'] = compute_z_level_resting_thickness( ds.layerThickness, ds.ssh, ds.bottomDepth, ds.minLevelCell, ds.maxLevelCell)
def broadcast_lonlat(ds):
    ds.coords['lon'] = ds['lon'] * xr.ones_like(ds['lat'])
    ds.coords['lat'] = xr.ones_like(ds['lon']) * ds['lat']
    return ds
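# Quick check of the helper above on a dataset with 1-D nominal lon/lat
# coordinates (synthetic data; only the function above is assumed):
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"data": (("x", "y"), np.random.rand(4, 3))},
    coords={
        "lon": ("x", np.linspace(0.0, 270.0, 4)),
        "lat": ("y", np.linspace(-60.0, 60.0, 3)),
    },
)

ds = broadcast_lonlat(ds)
assert ds["lon"].dims == ("x", "y")
assert ds["lat"].dims == ("x", "y")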
def test_replace_x_y_nominal_lat_lon(dask, nans): x = np.linspace(0, 720, 10) y = np.linspace(-200, 140, 5) lon = xr.DataArray(np.linspace(0, 360, len(x)), coords=[("x", x)]) lat = xr.DataArray(np.linspace(-90, 90, len(y)), coords=[("y", y)]) llon = lon * xr.ones_like(lat) llat = xr.ones_like(lon) * lat data = np.random.rand(len(x), len(y)) ds = xr.DataArray(data, coords=[("x", x), ("y", y)]).to_dataset(name="data") ds.coords["lon"] = llon ds.coords["lat"] = llat if nans: lon = ds["lon"].load().data lon[0, :] = np.nan lon[-1, :] = np.nan lon[:, 0] = np.nan lon[:, -1] = np.nan lon[15:23, 23:26] = np.nan ds["lon"].data = lon # for lats put only some nans in the middle. # I currently have no way to interpolate lats at the edge. lat = ds["lat"].load().data lat[15:23, 23:26] = np.nan ds["lat"].data = lat if dask: ds = ds.chunk({"x": -1, "y": -1}) ds.coords["lon"] = ds.coords["lon"].chunk({"x": -1, "y": -1}) ds.coords["lat"] = ds.coords["lat"].chunk({"x": -1, "y": -1}) replaced_ds = replace_x_y_nominal_lat_lon(ds) assert all(~np.isnan(replaced_ds.x)) assert all(~np.isnan(replaced_ds.y)) assert all(replaced_ds.x.diff("x") > 0) assert all(replaced_ds.y.diff("y") > 0) assert len(replaced_ds.lon.shape) == 2 assert len(replaced_ds.lat.shape) == 2 assert set(replaced_ds.lon.dims) == set(["x", "y"]) assert set(replaced_ds.lat.dims) == set(["x", "y"]) assert all(~np.isnan(replaced_ds.x)) assert all(~np.isnan(replaced_ds.y)) # test a dataset that would result in duplicates with current method x = np.linspace(0, 720, 4) y = np.linspace(-200, 140, 3) llon = xr.DataArray( np.array([[0, 50, 100, 150], [0, 50, 100, 150], [0, 50, 100, 150]]), coords=[("y", y), ("x", x)], ) llat = xr.DataArray( np.array([[0, 0, 10, 0], [10, 0, 0, 0], [20, 20, 20, 20]]), coords=[("y", y), ("x", x)], ) data = np.random.rand(len(x), len(y)) ds = xr.DataArray(data, coords=[("x", x), ("y", y)]).to_dataset(name="data") ds.coords["lon"] = llon ds.coords["lat"] = llat if dask: ds = ds.chunk({"x": -1, "y": -1}) ds.coords["lon"] = ds.coords["lon"].chunk({"x": -1, "y": -1}) ds.coords["lat"] = ds.coords["lat"].chunk({"x": -1, "y": -1}) replaced_ds = replace_x_y_nominal_lat_lon(ds) assert all(~np.isnan(replaced_ds.x)) assert all(~np.isnan(replaced_ds.y)) assert len(replaced_ds.y) == len(np.unique(replaced_ds.y)) assert len(replaced_ds.x) == len(np.unique(replaced_ds.x)) # make sure values are sorted in ascending order assert all(replaced_ds.x.diff("x") > 0) assert all(replaced_ds.y.diff("y") > 0) assert len(replaced_ds.lon.shape) == 2 assert len(replaced_ds.lat.shape) == 2 assert set(replaced_ds.lon.dims) == set(["x", "y"]) assert set(replaced_ds.lat.dims) == set(["x", "y"])
def _preprocess_single( self, shp_filepath: Path, reference_nc_filepath: Path, var_name: str, lookup_colname: str, save: bool = True, ) -> Optional[xr.Dataset]: """ Preprocess .shp admin boundary files into an `.nc` file with the same shape as reference_nc_filepath. Will create categorical .nc file which will specify which admin region each pixel is in. Arguments ---------- shp_filepath: Path The path to the shapefile reference_nc_filepath: Path The path to the netcdf file with the shape (must have been run through Preprocessors prior to using) var_name: str the name of the Variable in the xr.Dataset and the name of the output filename - {var_name}_{self.country}.nc lookup_colname: str the column name to lookup in the shapefile (read in as geopandas.GeoDataFrame) """ filename = self.get_filename(var_name) if (self.out_dir / filename).exists(): print("** Data already preprocessed! **\nIf you need to " "process again then move or delete existing file" f" at: {(self.out_dir / filename).as_posix()}") return None assert "interim" in reference_nc_filepath.parts, ( "Expected " "the target data to have been preprocessed by the pipeline") # MUST have a target dataset to create the same shape target_ds = xr.ones_like(xr.open_dataset(reference_nc_filepath)) data_var = [d for d in target_ds.data_vars][0] da = target_ds[data_var] # turn the shapefile into a categorical variable (like landcover) shp_to_nc = SHPtoXarray() ds = shp_to_nc.shapefile_to_xarray( da=da, shp_path=shp_filepath, var_name=var_name, lookup_colname=lookup_colname, ) # save the data if save: print(f"Saving to {self.out_dir}") if self.analysis is True: assert self.out_dir.parts[-2] == "analysis", ( "self.analysis should" "be True and the output directory should be analysis") ds.to_netcdf(self.out_dir / filename) print(f"** {(self.out_dir / filename).as_posix()} saved! **") return None else: return ds
def init(ds): return xr.ones_like(ds.isel(time=0))
def broadcast_lonlat(ds):
    ds.coords["lon"] = ds["lon"] * xr.ones_like(ds["lat"])
    ds.coords["lat"] = xr.ones_like(ds["lon"]) * ds["lat"]
    return ds
def compute_diagnostics( state: State, tendency: State, label: str, hydrostatic: bool ) -> Diagnostics: delp = state[DELP] temperature_tendency_name = "dQ1" humidity_tendency_name = "dQ2" temperature_tendency = tendency.get(temperature_tendency_name, xr.zeros_like(delp)) humidity_tendency = tendency.get(humidity_tendency_name, xr.zeros_like(delp)) # compute column-integrated diagnostics if hydrostatic: net_heating = vcm.column_integrated_heating_from_isobaric_transition( temperature_tendency, delp, "z" ) else: net_heating = vcm.column_integrated_heating_from_isochoric_transition( temperature_tendency, delp, "z" ) diags: Diagnostics = { f"net_moistening_due_to_{label}": vcm.mass_integrate( humidity_tendency, delp, dim="z" ).assign_attrs( units="kg/m^2/s", description=f"column integrated moisture tendency due to {label}", ), f"column_heating_due_to_{label}": net_heating.assign_attrs( units="W/m^2" ).assign_attrs(description=f"column integrated heating due to {label}"), } delp_tendency = STATE_NAME_TO_TENDENCY[DELP] if delp_tendency in tendency: net_mass_tendency = vcm.mass_integrate( xr.ones_like(tendency[delp_tendency]), tendency[delp_tendency], dim="z" ).assign_attrs( units="kg/m^2/s", description=f"column-integrated mass tendency due to {label}", ) diags[f"net_mass_tendency_due_to_{label}"] = net_mass_tendency # add 3D tendencies to diagnostics if label == "nudging": diags_3d: Mapping[Hashable, xr.DataArray] = { f"{TENDENCY_TO_STATE_NAME[k]}_tendency_due_to_nudging": v for k, v in tendency.items() } elif label == "machine_learning": diags_3d = { "dQ1": temperature_tendency.assign_attrs(units="K/s").assign_attrs( description=f"air temperature tendency due to {label}" ), "dQ2": humidity_tendency.assign_attrs(units="kg/kg/s").assign_attrs( description=f"specific humidity tendency due to {label}" ), } diags.update(diags_3d) # add 3D state to diagnostics for backwards compatibility diags.update({TEMP: state[TEMP], SPHUM: state[SPHUM], DELP: state[DELP]}) return diags
def get_section_trsp(fldx, fldy, grid, left, right, nx=100, is_normal=True): """ Interpolate a vector field to a section line, returning the normal component Note: DIRECTION NEEDS TO BE VERIFIED! Parameters ---------- fldx, fldy : xarray DataArray Containing vector field to grab along section left, right : tuple or list of 2 floats Containing lon/lat bounding points nx : int, optional Number of interpolation points Returns ------- q : xarray DataArray with interpolated vector field into section, dimension i and xc/yc as lon/lat along section, dim i """ # Create x/y coords for line x = np.linspace(left[0], right[0], nx + 1) y = np.linspace(left[1], right[1], nx + 1) # interp to mid point # create an index variable: i # interpolated result will live along this coordinate xc = xr.DataArray(_mov_avg(x), dims='i') yc = xr.DataArray(_mov_avg(y), dims='i') # Look for a mask for valid points maskW = fldx.maskW if 'maskW' in fldx.coords else True * xr.ones_like(fldx) maskS = fldy.maskS if 'maskS' in fldy.coords else True * xr.ones_like(fldy) # interpolate U and V to this point uvel = grid.interp(fldx.where(maskW, np.NAN), 'X', boundary='fill').interp(XC=xc, YC=yc) vvel = grid.interp(fldy.where(maskS, np.NAN), 'Y', boundary='fill').interp(XC=xc, YC=yc) # get coordinate system tangent and normal to this line dxc = xr.DataArray(np.diff(x), dims='i') dyc = xr.DataArray(np.diff(y), dims='i') sin = dyc / np.sqrt(dxc**2 + dyc**2) cos = dxc / np.sqrt(dxc**2 + dyc**2) # This rotation uses a negative angle rotation: # https://en.wikipedia.org/wiki/Rotation_matrix#Direction # consider purely zonal flow: (u,v) = (1,0) # and a low angle rotation, say theta=15 # i.e. xaxis from "->" to "/^", but less dramatic than I can draw here # then in the new coordinate system, this flow will # be mostly positive in the new zonal direction (close to one) # but the new v component will be slightly negative if is_normal: q = -sin * uvel + cos * vvel else: q = cos * uvel + sin * vvel myname = fldx.name[:-1] #drop the W,S q.name = myname # add xc,yc q = q.to_dataset() q['xc'] = xc.copy() q['yc'] = yc.copy() q = q.set_coords('xc') q = q.set_coords('yc') return q[myname]
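# Quick standalone check of the rotation used at the end of get_section_trsp:
# for purely zonal flow and a section tilted 15 degrees from the x-axis, the
# along-section component should be close to one and the normal component
# slightly negative, as the comment above describes (numbers illustrative).
import numpy as np

u, v = 1.0, 0.0                   # purely zonal flow
theta = np.deg2rad(15.0)          # section direction relative to x-axis
sin, cos = np.sin(theta), np.cos(theta)

along = cos * u + sin * v         # component along the section (~0.97)
normal = -sin * u + cos * v       # component normal to the section (~-0.26)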
def _tseries_gen(varname, component, ensemble, entries, cluster_in): """ generate a tseries for a particular ensemble member, return a Dataset object """ print_timestamp(f"varname={varname}") varname_resolved = _varname_resolved(varname, component) fnames = entries.loc[entries["ensemble"] == ensemble].files.tolist() print(fnames) with open(var_specs_fname, mode="r") as fptr: var_specs_all = yaml.safe_load(fptr) if varname in var_specs_all[component]["vars"]: var_spec = var_specs_all[component]["vars"][varname] else: var_spec = {} # use var specific reduce_dims if it exists, otherwise use reduce_dims for component if "reduce_dims" in var_spec: reduce_dims = var_spec["reduce_dims"] else: reduce_dims = var_specs_all[component]["reduce_dims"] # get rank of varname from first file, used to set time chunksize # approximate number of time levels, assuming all files have same number # save time encoding from first file, to restore it in the multi-file case # https://github.com/pydata/xarray/issues/2921 with xr.open_dataset(fnames[0]) as ds0: vardims = ds0[varname_resolved].dims rank = len(vardims) vertlen = ds0.dims[vardims[1]] if rank > 3 else 0 time_chunksize = 10 * 12 if rank < 4 else 6 ds0.chunk(chunks={time_name: time_chunksize}) time_encoding = ds0[time_name].encoding var_encoding = ds0[varname_resolved].encoding ds0_attrs = ds0.attrs ds0_encoding = ds0.encoding drop_var_names_loc = drop_var_names(component, ds0, varname_resolved) # instantiate cluster, if not provided via argument # ignore dashboard warnings when instantiating if cluster_in is None: if "ncar_jobqueue" in sys.modules: with warnings.catch_warnings(): warnings.filterwarnings(action="ignore", module=".*dashboard") cluster = ncar_jobqueue.NCARCluster() else: raise ValueError( "cluster_in not provided and ncar_jobqueue did not load successfully" ) else: cluster = cluster_in workers = 12 if vertlen >= 20: workers *= 2 if vertlen >= 60: workers *= 2 workers = 2 * round(workers / 2) # round to nearest multiple of 2 print_timestamp(f"calling cluster.scale({workers})") cluster.scale(workers) print_timestamp(f"dashboard_link={cluster.dashboard_link}") # create dask distributed client, connecting to workers with dask.distributed.Client(cluster) as client: print_timestamp("client instantiated") # tool to help track down file inconsistencies that trigger errors in open_mfdataset # test_open_mfdataset(fnames, time_chunksize, varname) # data_vars = "minimal", to avoid introducing time dimension to time-invariant fields when there are multiple files # only chunk in time, because if you chunk over spatial dims, then sum results depend on chunksize # https://github.com/pydata/xarray/issues/2902 with xr.open_mfdataset( fnames, data_vars="minimal", coords="minimal", compat="override", combine="by_coords", chunks={time_name: time_chunksize}, drop_variables=drop_var_names_loc, ) as ds_in: print_timestamp("open_mfdataset returned") # restore encoding for time from first file ds_in[time_name].encoding = time_encoding da_in_full = ds_in[varname_resolved] da_in_full.encoding = var_encoding var_units = clean_units(da_in_full.attrs["units"]) if "unit_conv" in var_spec: var_units = f"({var_spec['unit_conv']})({var_units})" # construct averaging/integrating weight weight = get_weight(ds_in, component, reduce_dims) weight_attrs = weight.attrs weight = get_rmask(ds_in, component) * weight weight.attrs = weight_attrs print_timestamp("weight constructed") # compute regional sum of weights da_in_t0 = da_in_full.isel({time_name: 0}).drop(time_name) 
ones_masked_t0 = xr.ones_like(da_in_t0).where(da_in_t0.notnull()) weight_sum = (ones_masked_t0 * weight).sum(dim=reduce_dims) weight_sum.name = f"weight_sum_{varname}" weight_sum.attrs = weight.attrs weight_sum.attrs[ "long_name" ] = f"sum of weights used in tseries generation for {varname}" tlen = da_in_full.sizes[time_name] print_timestamp(f"tlen={tlen}") # use var specific tseries_op if it exists, otherwise use tseries_op for component if "tseries_op" in var_spec: tseries_op = var_spec["tseries_op"] else: tseries_op = var_specs_all[component]["tseries_op"] ds_out_list = [] time_step_nominal = min(2 * workers * time_chunksize, tlen) time_step = math.ceil(tlen / (tlen // time_step_nominal)) print_timestamp(f"time_step={time_step}") for time_ind0 in range(0, tlen, time_step): print_timestamp(f"time_ind={time_ind0}, {time_ind0 + time_step}") da_in = da_in_full.isel( {time_name: slice(time_ind0, time_ind0 + time_step)} ) if tseries_op == "integrate": da_out = (da_in * weight).sum(dim=reduce_dims) da_out.name = varname da_out.attrs["long_name"] = "Integrated " + da_in.attrs["long_name"] da_out.attrs["units"] = cf_units.Unit( f"({weight.attrs['units']})({var_units})" ).format() elif tseries_op == "average": da_out = (da_in * weight).sum(dim=reduce_dims) ones_masked = xr.ones_like(da_in).where(da_in.notnull()) denom = (ones_masked * weight).sum(dim=reduce_dims) da_out /= denom da_out.name = varname da_out.attrs["long_name"] = "Averaged " + da_in.attrs["long_name"] da_out.attrs["units"] = cf_units.Unit(var_units).format() else: msg = f"tseries_op={tseries_op} not implemented" raise NotImplementedError(msg) print_timestamp("da_out computation setup") # propagate some settings from da_in to da_out da_out.encoding["dtype"] = da_in.encoding["dtype"] copy_fill_settings(da_in, da_out) ds_out = da_out.to_dataset() print_timestamp("ds_out generated") # copy particular variables from ds_in copy_var_list = [time_name] if "bounds" in ds_in[time_name].attrs: copy_var_list.append(ds_in[time_name].attrs["bounds"]) copy_var_list.extend(copy_var_names(component)) ds_out = xr.merge( [ ds_out, ds_in[copy_var_list].isel( {time_name: slice(time_ind0, time_ind0 + time_step)} ), ] ) print_timestamp("copy_var_names added") # force computation of ds_out, while resources of client are still available print_timestamp("calling ds_out.load") ds_out_list.append(ds_out.load()) print_timestamp("returned from ds_out.load") print_timestamp("concatenating ds_out_list datasets") ds_out = xr.concat( ds_out_list, dim=time_name, data_vars=[varname], coords="minimal", compat="override", ) # set ds_out.time to mid-interval values ds_out = time_set_mid(ds_out, time_name) print_timestamp("time_set_mid returned") # copy file attributes ds_out.attrs = ds0_attrs datestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z") msg = f"{datestamp}: created by {__file__}" if "history" in ds_out.attrs: ds_out.attrs["history"] = "\n".join([msg, ds_out.attrs["history"]]) else: ds_out.attrs["history"] = msg ds_out.attrs["input_file_list"] = " ".join(fnames) for key in ["unlimited_dims"]: if key in ds0_encoding: ds_out.encoding[key] = ds0_encoding[key] # restore encoding for time from first file ds_out[time_name].encoding = time_encoding # change output units, if specified in var_spec units_key = ( "integral_display_units" if tseries_op == "integrate" else "display_units" ) if units_key in var_spec: ds_out[varname] = conv_units(ds_out[varname], var_spec[units_key]) print_timestamp("units converted") # add regional sum of weights 
ds_out[weight_sum.name] = weight_sum print_timestamp("ds_in and client closed") # if cluster was instantiated here, close it if cluster_in is None: cluster.close() return ds_out
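# The "average" branch above builds NaN-aware weights by masking a ones_like
# field with the data's own missing values. The same reduction in isolation,
# on synthetic data rather than the CESM output handled above:
import numpy as np
import xarray as xr

da = xr.DataArray(
    np.random.rand(2, 3, 4),
    dims=["time", "lat", "lon"],
    coords={"lat": [-45.0, 0.0, 45.0]},
)
da = da.where(da > 0.2)  # introduce some missing values

# area-like weights (cos(lat) here), broadcast over lon
weight = np.cos(np.deg2rad(da.lat)) * xr.ones_like(da.isel(time=0, drop=True))

# ones where data are valid, NaN elsewhere, so the denominator only counts
# weights of cells that actually contribute to the numerator
ones_masked = xr.ones_like(da).where(da.notnull())

reduce_dims = ["lat", "lon"]
weighted_mean = (
    (da * weight).sum(dim=reduce_dims) / (ones_masked * weight).sum(dim=reduce_dims)
)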
def run(self): """ Run this step of the test case """ # create the mesh and graph.info super().run() config = self.config section = config['horizontal_advection'] temperature = section.getfloat('temperature') salinity = section.getfloat('salinity') x_center = 1e3*section.getfloat('x_center') y_center = 1e3*section.getfloat('y_center') advect_x = section.getboolean('advect_x') advect_y = section.getboolean('advect_y') gaussian_width = 1e3*section.getfloat('gaussian_width') section = config['planar_convergence'] duration = 3600.*section.getfloat('duration') dt_1km = section.getint('dt_1km') resolution = float(self.resolution) dt = dt_1km * resolution dc = resolution*1e3 ds = xarray.open_dataset('mesh.nc') xCell = ds.xCell yCell = ds.yCell bottom_depth = config.getfloat('vertical_grid', 'bottom_depth') ds['bottomDepth'] = bottom_depth * xarray.ones_like(xCell) ds['ssh'] = xarray.zeros_like(xCell) init_vertical_coord(config, ds) if advect_x: x_vel = ds.attrs['x_period']/duration x_cfl = x_vel*dt/dc print(f'x_cfl: {x_cfl}') else: x_vel = 0. if advect_y: y_vel = ds.attrs['y_period']/duration y_cfl = y_vel*dt/dc print(f'y_cfl: {y_cfl}') else: y_vel = 0. temperature = temperature*xarray.ones_like(xCell) temperature, _ = xarray.broadcast(temperature, ds.refBottomDepth) temperature = temperature.transpose('nCells', 'nVertLevels') temperature = temperature.expand_dims(dim='Time', axis=0) salinity = salinity*xarray.ones_like(temperature) angleEdge = ds.angleEdge normalVelocity = (numpy.cos(angleEdge) * x_vel + numpy.sin(angleEdge) * y_vel) normalVelocity, _ = xarray.broadcast(normalVelocity, ds.refBottomDepth) normalVelocity = normalVelocity.transpose('nEdges', 'nVertLevels') normalVelocity = normalVelocity.expand_dims(dim='Time', axis=0) dist_sq = (xCell - x_center)**2 + (yCell - y_center)**2 tracer1 = numpy.exp(-0.5*dist_sq/gaussian_width**2) tracer1, _ = xarray.broadcast(tracer1, ds.refBottomDepth) tracer1 = tracer1.transpose('nCells', 'nVertLevels') tracer1 = tracer1.expand_dims(dim='Time', axis=0) ds['temperature'] = temperature ds['salinity'] = salinity * xarray.ones_like(temperature) ds['normalVelocity'] = normalVelocity ds['fCell'] = xarray.zeros_like(xCell) ds['fEdge'] = xarray.zeros_like(ds.xEdge) ds['fVertex'] = xarray.zeros_like(ds.xVertex) ds['tracer1'] = tracer1 write_netcdf(ds, 'initial_state.nc')
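# The initial-state step above repeatedly expands a scalar into a field with
# the (Time, nCells, nVertLevels) layout via ones_like, broadcast, transpose
# and expand_dims. The same pattern in isolation (sizes below are made up):
import numpy as np
import xarray as xr

xCell = xr.DataArray(np.linspace(0.0, 9e3, 10), dims=["nCells"])
refBottomDepth = xr.DataArray(np.linspace(10.0, 100.0, 5), dims=["nVertLevels"])

# scalar -> constant per-cell field
temperature = 10.0 * xr.ones_like(xCell)

# per-cell -> per-cell-per-level, then reorder dims and add a Time dimension
temperature, _ = xr.broadcast(temperature, refBottomDepth)
temperature = temperature.transpose("nCells", "nVertLevels")
temperature = temperature.expand_dims(dim="Time", axis=0)

assert temperature.dims == ("Time", "nCells", "nVertLevels")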
import numpy as np import xarray as xr airds = xr.tutorial.open_dataset("air_temperature").isel(time=slice(4), lon=slice(50)) airds.air.attrs["cell_measures"] = "area: cell_area" airds.air.attrs["standard_name"] = "air_temperature" airds.coords["cell_area"] = (xr.DataArray(np.cos(airds.lat * np.pi / 180)) * xr.ones_like(airds.lon) * 105e3 * 110e3) ds_no_attrs = airds.copy(deep=True) for variable in ds_no_attrs.variables: ds_no_attrs[variable].attrs = {} popds = xr.Dataset() popds.coords["TLONG"] = ( ("nlat", "nlon"), np.ones((20, 30)), { "units": "degrees_east" }, ) popds.coords["TLAT"] = ( ("nlat", "nlon"), 2 * np.ones((20, 30)), { "units": "degrees_north" }, ) popds.coords["ULONG"] = ( ("nlat", "nlon"),
def change_significance( fut: Union[xr.DataArray, xr.Dataset], ref: Union[xr.DataArray, xr.Dataset] = None, test: str = "ttest", **kwargs, ) -> Tuple[Union[xr.DataArray, xr.Dataset], Union[xr.DataArray, xr.Dataset]]: """Robustness statistics qualifying how the members of an ensemble agree on the existence of change and on its sign. Parameters ---------- fut : Union[xr.DataArray, xr.Dataset] Future period values along 'realization' and 'time' (..., nr, nt1) or if `ref` is None, Delta values along `realization` (..., nr). ref : Union[xr.DataArray, xr.Dataset], optional Reference period values along realization' and 'time' (..., nt2, nr). The size of the 'time' axis does not need to match the one of `fut`. But their 'realization' axes must be identical. If `None` (default), values of `fut` are assumed to be deltas instead of a distribution across the future period. `fut` and `ref` must be of the same type (Dataset or DataArray). If they are Dataset, they must have the same variables (name and coords). test : {'ttest', 'welch-ttest', 'threshold', None} Name of the statistical test used to determine if there was significant change. See notes. kwargs Other arguments specific to the statistical test. For 'ttest' and 'welch-ttest': p_change : float (default : 0.05) p-value threshold for rejecting the hypothesis of no significant change. For 'threshold': (Only one of those must be given.) abs_thresh : float (no default) Threshold for the (absolute) change to be considered significative. rel_thresh : float (no default, in [0, 1]) Threshold for the relative change (in reference to ref) to be significative. Only valid if `ref` is given. Returns ------- change_frac The fraction of members that show significant change [0, 1]. Passing `test=None` yields change_frac = 1 everywhere. Same type as `fut`. pos_frac The fraction of members showing significant change that show a positive change ]0, 1]. Null values are returned where no members show significant change. The table below shows the coefficient needed to retrieve the number of members that have the indicated characteristics, by multiplying it to the total number of members (`fut.realization.size`). +-----------------+------------------------------+------------------------+ | | Significant change | Non-significant change | +-----------------+------------------------------+------------------------+ | Any direction | change_frac | 1 - change_frac | +-----------------+------------------------------+------------------------+ | Positive change | pos_frac * change_frac | N.A. | +-----------------+------------------------------+ | | Negative change | (1 - pos_frac) * change_frac | | +-----------------+------------------------------+------------------------+ Notes ----- Available statistical tests are : 'ttest' : Single sample T-test. Same test as used by [tebaldi2011]_. The future values are compared against the reference mean (over 'time'). Change is qualified as 'significant' when the test's p-value is below the user-provided `p_change` value. 'welch-ttest' : Two-sided T-test, without assuming equal population variance. Same significance criterion as 'ttest'. 'threshold' : Change is considered significative if the absolute delta exceeds a given threshold (absolute or relative). None : Significant change is not tested and, thus, members showing no change are included in the `sign_frac` output. References ---------- .. [tebaldi2011] Tebaldi C., Arblaster, J.M. and Knutti, R. (2011) Mapping model agreement on future climate projections. GRL. 
doi:10.1029/2011GL049863 Example ------- This example computes the mean temperature in an ensemble and compares two time periods, qualifying significant change through a single sample T-test. >>> from xclim import ensembles >>> ens = ensembles.create_ensemble(temperature_datasets) >>> tgmean = xclim.atmos.tg_mean(tas=ens.tas, freq='YS') >>> fut = tgmean.sel(time=slice('2020', '2050')) >>> ref = tgmean.sel(time=slice('1990', '2020')) >>> chng_f, pos_f = ensembles.change_significance(fut, ref, test='ttest') If the deltas were already computed beforehand, the 'threshold' test can still be used, here with a 2 K threshold. >>> delta = fut.mean('time') - ref.mean('time') >>> chng_f, pos_f = ensembles.change_significance(delta, test='threshold', abs_thresh=2) """ test_params = { "ttest": ["p_change"], "welch-ttest": ["p_change"], "threshold": ["abs_thresh", "rel_thresh"], } changed = None if ref is None: delta = fut n_valid_real = delta.notnull().sum("realization") if test not in ["threshold", None]: raise ValueError( "When deltas are given (ref=None), 'test' must be one of ['threshold', None]" ) else: delta = fut.mean("time") - ref.mean("time") n_valid_real = fut.notnull().all("time").sum("realization") if test == "ttest": p_change = kwargs.setdefault("p_change", 0.05) # Test hypothesis of no significant change pvals = xr.apply_ufunc( lambda f, r: spstats.ttest_1samp(f, r, axis=-1, nan_policy="omit")[ 1], fut, ref.mean("time"), input_core_dims=[["realization", "time"], ["realization"]], output_core_dims=[["realization"]], vectorize=True, dask="parallelized", output_dtypes=[float], ) # When p < p_change, the hypothesis of no significant change is rejected. changed = pvals < p_change elif test == "welch-ttest": p_change = kwargs.setdefault("p_change", 0.05) # Test hypothesis of no significant change # equal_var=False -> Welch's T-test pvals = xr.apply_ufunc( lambda f, r: spstats.ttest_ind( f, r, axis=-1, equal_var=False, nan_policy="omit")[1], fut, ref, input_core_dims=[["realization", "time"], ["realization", "time"]], output_core_dims=[["realization"]], exclude_dims={"time"}, vectorize=True, dask="parallelized", output_dtypes=[float], ) # When p < p_change, the hypothesis of no significant change is rejected. changed = pvals < p_change elif test == "threshold": if "abs_thresh" in kwargs and "rel_thresh" not in kwargs: changed = abs(delta) > kwargs["abs_thresh"] elif "rel_thresh" in kwargs and "abs_thresh" not in kwargs and ref is not None: changed = abs(delta / ref.mean("time")) > kwargs["rel_thresh"] else: raise ValueError( "Invalid argument combination for test='threshold'.") elif test is not None: raise ValueError( f"Statistical test {test} must be one of {', '.join(test_params.keys())}." ) if test is not None: delta_chng = delta.where(changed) change_frac = changed.sum("realization") / n_valid_real else: delta_chng = delta change_frac = xr.ones_like(delta.isel(realization=0)) # Test that models agree on the sign of the change # This returns NaN (cause 0 / 0) where no model show significant change. pos_frac = (delta_chng > 0).sum("realization") / (change_frac * n_valid_real) # Metadata kwargs_str = ", ".join( [f"{k}: {v}" for k, v in kwargs.items() if k in test_params[test]]) test_str = ( f"Significant change was tested with test {test} with parameters {kwargs_str}." ) das = {"fut": fut} if ref is None else {"fut": fut, "ref": ref} pos_frac.attrs.update( description= "Fraction of members showing significant change that agree on a positive change. 
" + test_str, units="", test=str(test), history=update_history( f"pos_frac from change_significance(fut=fut, ref=ref, test={test}, {kwargs_str})", **das, ), ) change_frac.attrs.update( description="Fraction of members showing significant change. " + test_str, units="", test=str(test), history=update_history( f"change_frac from change_significance(fut=fut, ref=ref, test={test}, {kwargs_str})", **das, ), ) return change_frac, pos_frac
def truncate_dataarray(dataarray, quantile_dims, replace_with_mean=False, mean_dims=None, weights=None, quantiles=None, extra_dim=None): r"""Truncates the dataarray over the given dimensions, meaning that data outside the upper and lower quantiles, which are taken across the dimensions ``quantile_dims``, are replaced either with: 1. the upper and lower quantiles themselves. 2. or with the mean of the in-lier data, which is taken across the dimensions given by ``mean_dims``. **Note**: If weights are given, then weighted-quantiles and weighted-means are taken, otherwise the quantiles and means are unweighted. Args: dataarray (xarray.DataArray): dataarray that has at least the dimensions given by ``dims``, and if ``replace_with_mean`` is True, then also ``mean_dims``. replace_with_mean (bool, optional): If True, then replace values outside of the upper and lower quantiles and with the mean across the dimensions given by `mean_dims`, if False, then replace with the upper and lower bounds themselves. mean_dims (list[str], optional): dimensions to take mean within the bounds over quantile_dims (list[str]): dimensions to take quantiles over -- the quantiles are used to make the bounds. weights (xarray.DataArray, optional): Must have one dimension and can have up two dimensions. quantiles (tuple[float, float] | list[float, float], optional): The tuple of two floats representing the quantiles to take. extra_dim (str): Extra dimension that exists in `weights` and `data`. It should not be in `stat_dims`. Returns: (xarray.DataArray): Same shape as the original array, but with truncated values. Raises: (ValueError): If `replace_with_mean` is True, and `mean_dims` is not list of strings. """ LOGGER.debug("Entering the `truncate_dataarray` function") LOGGER.debug("quantile_dims:{}".format(quantile_dims)) LOGGER.debug("replace_with_mean:{}".format(replace_with_mean)) LOGGER.debug("mean_dims:{}".format(mean_dims)) LOGGER.debug("weights:{}".format(weights)) LOGGER.debug("quantiles:{}".format(quantiles)) LOGGER.debug("extra_dim:{}".format(extra_dim)) if replace_with_mean and not mean_dims: mean_dims_err_msg = ( "If `replace_with_mean` is True, then `mean_dims` " "must be a list of strings") LOGGER.error(mean_dims_err_msg) raise ValueError(mean_dims_err_msg) else: pass # `mean_dims` doesn't can be None quantiles = (Quantiles( *sorted(quantiles)) if quantiles else Quantiles(0.05, 0.95)) if weights is not None: quantile_values = weighted_quantile_with_extra_dim( dataarray, quantiles, list(quantile_dims), weights, extra_dim) else: quantile_values = dataarray.quantile(quantiles, dim=list(quantile_dims)) lower_da = quantile_values.sel(quantile=quantiles.lower) upper_da = quantile_values.sel(quantile=quantiles.upper) if replace_with_mean: good_indexes = (dataarray >= lower_da) & (dataarray <= upper_da) inside_da = dataarray.where(good_indexes) outside_da = dataarray.where(~good_indexes) if weights is not None: inside_mean_da = weighted_mean_with_extra_dim( inside_da, mean_dims, weights, extra_dim) else: inside_mean_da = inside_da.mean(mean_dims) truncated_da = (inside_da.combine_first( xr.ones_like(outside_da) * inside_mean_da)) else: expanded_lower_da, _ = xr.broadcast(lower_da, dataarray) expanded_lower_da = expanded_lower_da.transpose(*dataarray.coords.dims) expanded_upper_da, _ = xr.broadcast(upper_da, dataarray) expanded_upper_da = expanded_upper_da.transpose(*dataarray.coords.dims) truncated_da = dataarray.clip(min=expanded_lower_da, max=expanded_upper_da) LOGGER.debug("Leaving the `truncate_dataarray` 
function") return truncated_da
def region(xds, name='region1', ra=None, dec=None, pixels=None, pol=-1, channels=-1): """ Create a new region Data variable in the Dataset \n .. note:: This function currently only supports rectangles and integer pixel boundaries Parameters ---------- xds : xarray.core.dataset.Dataset input image dataset name : str dataset variable name for region, overwrites if already present ra : list right ascension coordinate range in the form of [min, max]. Default None means all dec : list declination coordinate range in the form of [min, max]. Default None means all pixels : array_like array of shape (N,2) containing pixel box. OR'd with ra/dec pol : int or list polarization dimension(s) to include in region. Default of -1 means all channels : int or list channel dimension(s) to include in region. Default of -1 means all Returns ------- xarray.core.dataset.Dataset New Dataset """ import numpy as np import dask.array as da import xarray as xr # type checking/conversion if not name.strip(): name = 'regionX' if ra is None: ra = [0.0, 0.0] if dec is None: dec = [0.0, 0.0] if pixels is None: pixels = np.zeros((1, 2), dtype=int) - 1 pixels = np.array(pixels, dtype=int) if (pixels.ndim != 2) or (pixels.shape[1] != 2): print('ERROR: pixels parameter not a (N,2) array') return None pol = np.array(np.atleast_1d(pol), dtype=int) if pol[0] == -1: pol = list(range(len(xds['pol']))) channels = np.array(np.atleast_1d(channels), dtype=int) if channels[0] == -1: channels = list(range(len(xds['chan']))) # TBD: allow arbitrary pixels, not just rectangles #ind_x = xr.DataArray(list(pixels[:,0]), dims=['d0']) #ind_y = xr.DataArray(list(pixels[:,1]), dims=['d1']) #region = xds.image[ind_x, ind_y] # TESTING only # ra = [2.88788, 2.88793] # dec = [-0.60573, -0.60568] # pixels = np.array([[20,40],[80,500]]) # define region within ra/dec range region = xr.ones_like(xds.image, dtype=bool).where( (xds.right_ascension > np.min(ra)) & (xds.right_ascension < np.max(ra)) & (xds.declination > np.min(dec)) & (xds.declination < np.max(dec)), False) # OR pixel values with ra/dec values #region = region | xr.ones_like(xds.image,dtype=bool).where(xds.d0.isin(pixels[:,0]) & # xds.d1.isin(pixels[:,1]), False) region = region | xr.ones_like(xds.image, dtype=bool).where( (xds.d0 > np.min(pixels[:, 0])) & (xds.d0 < np.max(pixels[:, 0])) & (xds.d1 > np.min(pixels[:, 1])) & (xds.d1 < np.max(pixels[:, 1])), False) # apply polarization and channels selections region = region.where(xds.pol.isin(xds.pol[pol]), False) region = region.where(xds.chan.isin(xds.chan[channels]), False) # assign region to a rest of image dataset xds = xds.assign(dict([(name, region)])) return xds
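# Hedged sketch (synthetic 2-D "image") of the masking pattern used by `region`
# above: start from an all-True array, keep True only inside a coordinate box,
# then OR with a pixel-index box. All names and values here are illustrative.
import numpy as np
import xarray as xr

img_sketch = xr.DataArray(
    np.random.rand(10, 12),
    dims=("d0", "d1"),
    coords={"d0": np.arange(10), "d1": np.arange(12)},
)
ra_sketch = 2.8 + 0.01 * img_sketch.d0    # stand-in right-ascension coordinate
dec_sketch = -0.6 + 0.01 * img_sketch.d1  # stand-in declination coordinate

coord_box = xr.ones_like(img_sketch, dtype=bool).where(
    (ra_sketch > 2.82) & (ra_sketch < 2.87)
    & (dec_sketch > -0.58) & (dec_sketch < -0.53),
    False,
)
pixel_box = xr.ones_like(img_sketch, dtype=bool).where(
    (img_sketch.d0 > 1) & (img_sketch.d0 < 6)
    & (img_sketch.d1 > 3) & (img_sketch.d1 < 9),
    False,
)
region_mask_sketch = coord_box | pixel_box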
def cutout( od, varList=None, YRange=None, XRange=None, add_Hbdr=False, mask_outside=False, ZRange=None, add_Vbdr=False, timeRange=None, timeFreq=None, sampMethod="snapshot", dropAxes=False, transformation=False, centered="Atlantic", ): """ Cutout the original dataset in space and time preserving the original grid structure. Parameters ---------- od: OceanDataset oceandataset to subsample varList: 1D array_like, str, or None List of variables (strings). YRange: 1D array_like, scalar, or None Y axis limits (e.g., latitudes). If len(YRange)>2, max and min values are used. XRange: 1D array_like, scalar, or None X axis limits (e.g., longitudes). If len(XRange)>2, max and min values are used. add_Hbdr: bool, scal If scalar, add and subtract `add_Hbdr` to the the horizontal range. of the horizontal ranges. If True, automatically estimate add_Hbdr. If False, add_Hbdr is set to zero. mask_outside: bool If True, set all values in areas outside specified (Y,X)ranges to NaNs. (Useful for curvilinear grids). ZRange: 1D array_like, scalar, or None Z axis limits. If len(ZRange)>2, max and min values are used. add_Vbdr: bool, scal If scalar, add and subtract `add_Vbdr` to the the vertical range. If True, automatically estimate add_Vbdr. If False, add_Vbdr is set to zero. timeRange: 1D array_like, numpy.ScalarType, or None time axis limits. If len(timeRange)>2, max and min values are used. timeFreq: str or None Time frequency. Available optionts are pandas Offset Aliases (e.g., '6H'): http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases sampMethod: {'snapshot', 'mean'} Downsampling method (only if timeFreq is not None). dropAxes: 1D array_like, str, or bool List of axes to remove from Grid object. if one point only is in the range. If True, set dropAxes=od.grid_coords. If False, preserve original grid. transformation: str, or bool Lists the transformation of the llcgrid into a new one in which face is no longer a dimension. Default is `False`. If `True`, need to define how data will be centered centered: str, or bool default is `Atlantic`, and other options is `Pacific`. This refers to which ocean appears centered on the data. Returns ------- od: OceanDataset Subsampled oceandataset Notes ----- If any of the horizontal ranges is not None, the horizontal dimensions of the cutout will have len(Xp1)>len(X) and len(Yp1)>len(Y) even if the original oceandataset had len(Xp1)==len(X) or len(Yp1)==len(Y). """ # Checks unsupported_dims = ["mooring", "particle", "station"] check1 = XRange is not None or YRange is not None if check1 and any([dim in unsupported_dims for dim in od._ds.dims]): _warnings.warn( "\nHorizontal cutout not supported" "for moorings, surveys, and particles", stacklevel=2, ) XRange = None YRange = None _check_instance( { "od": od, "add_Hbdr": add_Hbdr, "mask_outside": mask_outside, "timeFreq": timeFreq, }, { "od": "oceanspy.OceanDataset", "add_Hbdr": "(float, int, bool)", "mask_outside": "bool", "timeFreq": ["type(None)", "str"], }, ) varList = _check_list_of_string(varList, "varList") YRange = _check_range(od, YRange, "YRange") XRange = _check_range(od, XRange, "XRange") ZRange = _check_range(od, ZRange, "ZRange") timeRange = _check_range(od, timeRange, "timeRange") sampMethod_list = ["snapshot", "mean"] if sampMethod not in sampMethod_list: raise ValueError("`sampMethod` [{}] is not supported." 
"\nAvailable options: {}" "".format(sampMethod, sampMethod_list)) if not isinstance(dropAxes, bool): dropAxes = _check_list_of_string(dropAxes, "dropAxes") axes_warn = [axis for axis in dropAxes if axis not in od.grid_coords] if len(axes_warn) != 0: _warnings.warn( "\n{} are not axes of the oceandataset" "".format(axes_warn), stacklevel=2, ) dropAxes = list(set(dropAxes) - set(axes_warn)) dropAxes = {d: od.grid_coords[d] for d in dropAxes} elif dropAxes is True: dropAxes = od.grid_coords if YRange is None: dropAxes.pop("Y", None) if XRange is None: dropAxes.pop("X", None) if ZRange is None: dropAxes.pop("Z", None) if timeRange is None: dropAxes.pop("time", None) else: dropAxes = {} # Message print("Cutting out the oceandataset.") # Copy od = _copy.copy(od) # list for coord variables co_list = [var for var in od._ds.coords if var not in od._ds.dims] # Drop variables if varList is not None: # Make sure it's a list varList = list(varList) varList = varList + co_list varList = _rename_aliased(od, varList) # Compute missing variables od = _compute._add_missing_variables(od, varList) # Drop useless nvarlist = [v for v in od._ds.data_vars if v not in varList] od._ds = od._ds.drop_vars(nvarlist) else: # this way, if applicable, llc_transf gets applied to all vars varList = [var for var in od._ds.reset_coords().data_vars] # Unpack ds = od._ds periodic = od.grid_periodic # --------------------------- # Time CUTOUT # --------------------------- # Initialize vertical mask maskT = _xr.ones_like(ds["time"]).astype("int") if timeRange is not None: # Use arrays timeRange = _np.asarray([_np.min(timeRange), _np.max(timeRange)]).astype(ds["time"].dtype) # Get the closest for i, time in enumerate(timeRange): if _np.issubdtype(ds["time"].dtype, _np.datetime64): diff = _np.fabs(ds["time"].astype("float64") - time.astype("float64")) else: diff = _np.fabs(ds["time"] - time) timeRange[i] = ds["time"].where(diff == diff.min(), drop=True).min().values maskT = maskT.where( _np.logical_and(ds["time"] >= timeRange[0], ds["time"] <= timeRange[-1]), 0) # Find time indexes maskT = maskT.assign_coords(time=_np.arange(len(maskT["time"]))) dmaskT = maskT.where(maskT, drop=True) dtime = dmaskT["time"].values iT = [min(dtime), max(dtime)] maskT["time"] = ds["time"] # Indexis if iT[0] == iT[1]: if "time" not in dropAxes: if iT[0] > 0: iT[0] = iT[0] - 1 else: iT[1] = iT[1] + 1 else: dropAxes.pop("time", None) # Cutout ds = ds.isel(time=slice(iT[0], iT[1] + 1)) if "time_midp" in ds.dims: if "time" in dropAxes: if iT[0] == len(ds["time_midp"]): iT[0] = iT[0] - 1 iT[1] = iT[1] - 1 ds = ds.isel(time_midp=slice(iT[0], iT[1] + 1)) else: ds = ds.isel(time_midp=slice(iT[0], iT[1])) # --------------------------- # Vertical CUTOUT # --------------------------- # Initialize vertical mask maskV = _xr.ones_like(ds["Zp1"]) if ZRange is not None: # Use arrays ZRange = _np.asarray( [_np.min(ZRange) - add_Vbdr, _np.max(ZRange) + add_Vbdr]) ZRange = ZRange.astype(ds["Zp1"].dtype) # Get the closest for i, Z in enumerate(ZRange): diff = _np.fabs(ds["Zp1"] - Z) ZRange[i] = ds["Zp1"].where(diff == diff.min()).min().values maskV = maskV.where( _np.logical_and(ds["Zp1"] >= ZRange[0], ds["Zp1"] <= ZRange[-1]), 0) # Find vertical indexes maskV = maskV.assign_coords(Zp1=_np.arange(len(maskV["Zp1"]))) dmaskV = maskV.where(maskV, drop=True) dZp1 = dmaskV["Zp1"].values iZ = [_np.min(dZp1), _np.max(dZp1)] maskV["Zp1"] = ds["Zp1"] # Indexis if iZ[0] == iZ[1]: if "Z" not in dropAxes: if iZ[0] > 0: iZ[0] = iZ[0] - 1 else: iZ[1] = iZ[1] + 1 else: 
dropAxes.pop("Z", None) # Cutout ds = ds.isel(Zp1=slice(iZ[0], iZ[1] + 1)) if "Z" in dropAxes: if iZ[0] == len(ds["Z"]): iZ[0] = iZ[0] - 1 iZ[1] = iZ[1] - 1 ds = ds.isel(Z=slice(iZ[0], iZ[1] + 1)) else: ds = ds.isel(Z=slice(iZ[0], iZ[1])) if len(ds["Zp1"]) == 1: if "Zu" in ds.dims and len(ds["Zu"]) > 1: ds = ds.sel(Zu=ds["Zp1"].values, method="nearest") if "Zl" in ds.dims and len(ds["Zl"]) > 1: ds = ds.sel(Zl=ds["Zp1"].values, method="nearest") else: if "Zu" in ds.dims and len(ds["Zu"]) > 1: ds = ds.isel(Zu=slice(iZ[0], iZ[1])) if "Zl" in ds.dims and len(ds["Zl"]) > 1: ds = ds.isel(Zl=slice(iZ[0], iZ[1])) # --------------------------- # Horizontal CUTOUT (part I, split into two to avoid repeated code) # --------------------------- if add_Hbdr is True: add_Hbdr = _np.mean([ _np.fabs(od._ds["XG"].max() - od._ds["XG"].min()), _np.fabs(od._ds["YG"].max() - od._ds["YG"].min()), ]) add_Hbdr = add_Hbdr / _np.mean([len(od._ds["X"]), len(od._ds["Y"])]) elif add_Hbdr is False: add_Hbdr = 0 if add_Vbdr is True: add_Vbdr = _np.fabs(od._ds["Zp1"].diff("Zp1")).max().values elif add_Vbdr is False: add_Vbdr = 0 # Initialize horizontal mask if XRange is not None or YRange is not None: maskH, dmaskH, XRange, YRange = get_maskH(ds, add_Hbdr, add_Vbdr, XRange, YRange) if transformation is not False and "face" in ds.dims: if XRange is None and YRange is None: faces = "all" else: faces = dmaskH["face"].values # gets faces that survives cutout _transf_list = ["arctic_crown", "arctic_centered"] if transformation in _transf_list: arg = { "ds": ds, "varlist": varList, # vars and grid coords to transform "centered": centered, "faces": faces, "drop": True, # required to calculate U-V grid points } if transformation == "arctic_crown": _transformation = _llc_trans.arctic_crown elif transformation == "arctic_centered": _transformation = _llc_trans.arctic_centered dsnew = _transformation(**arg) dsnew = dsnew.set_coords(co_list) grid_coords = od.grid_coords od._ds = dsnew manipulate_coords = {"coordsUVfromG": True} od = od.manipulate_coords(**manipulate_coords) if len(grid_coords["time"]) > 1: grid_coords["time"].pop("time_midp", None) grid_coords = {"add_midp": True, "grid_coords": grid_coords} od = od.set_grid_coords(**grid_coords, overwrite=True) od._ds.attrs["OceanSpy_description"] = "Cutout of" "simulation, with simple topology (face not a dimension)" # Unpack the new dataset without face as dimension ds = od._ds maskH, dmaskH, XRange, YRange = get_maskH(ds, add_Hbdr, add_Vbdr, XRange, YRange) elif transformation not in _transf_list: raise ValueError("transformation not supported") elif transformation is False and "face" in ds.dims: raise ValueError("Must define a transformation to remove complex" "topology of dataset.") # --------------------------- # Horizontal CUTOUT part II (continuation of original code) # --------------------------- if XRange is not None or YRange is not None: dYp1 = dmaskH["Yp1"].values dXp1 = dmaskH["Xp1"].values iY = [_np.min(dYp1), _np.max(dYp1)] iX = [_np.min(dXp1), _np.max(dXp1)] maskH["Yp1"] = ds["Yp1"] maskH["Xp1"] = ds["Xp1"] # Original length lenY = len(ds["Yp1"]) lenX = len(ds["Xp1"]) # Indexis if iY[0] == iY[1]: if "Y" not in dropAxes: if iY[0] > 0: iY[0] = iY[0] - 1 else: iY[1] = iY[1] + 1 else: dropAxes.pop("Y", None) if iX[0] == iX[1]: if "X" not in dropAxes: if iX[0] > 0: iX[0] = iX[0] - 1 else: iX[1] = iX[1] + 1 else: dropAxes.pop("X", None) ds = ds.isel(Yp1=slice(iY[0], iY[1] + 1), Xp1=slice(iX[0], iX[1] + 1)) Xcoords = od._grid.axes["X"].coords if "X" in dropAxes: if iX[0] == 
len(ds["X"]): iX[0] = iX[0] - 1 iX[1] = iX[1] - 1 ds = ds.isel(X=slice(iX[0], iX[1] + 1)) elif ("outer" in Xcoords and Xcoords["outer"] == "Xp1") or ("left" in Xcoords and Xcoords["left"] == "Xp1"): ds = ds.isel(X=slice(iX[0], iX[1])) elif "right" in Xcoords and Xcoords["right"] == "Xp1": ds = ds.isel(X=slice(iX[0] + 1, iX[1] + 1)) Ycoords = od._grid.axes["Y"].coords if "Y" in dropAxes: if iY[0] == len(ds["Y"]): iY[0] = iY[0] - 1 iY[1] = iY[1] - 1 ds = ds.isel(Y=slice(iY[0], iY[1] + 1)) elif ("outer" in Ycoords and Ycoords["outer"] == "Yp1") or ("left" in Ycoords and Ycoords["left"] == "Yp1"): ds = ds.isel(Y=slice(iY[0], iY[1])) elif "right" in Ycoords and Ycoords["right"] == "Yp1": ds = ds.isel(Y=slice(iY[0] + 1, iY[1] + 1)) # Cut axis can't be periodic if (len(ds["Yp1"]) < lenY or "Y" in dropAxes) and "Y" in periodic: periodic.remove("Y") if (len(ds["Xp1"]) < lenX or "X" in dropAxes) and "X" in periodic: periodic.remove("X") # --------------------------- # Horizontal MASK # --------------------------- if mask_outside and (YRange is not None or XRange is not None): if YRange is not None: minY = YRange[0] maxY = YRange[1] else: minY = ds["YG"].min().values maxY = ds["YG"].max().values if XRange is not None: minX = XRange[0] maxX = XRange[1] else: minX = ds["XG"].min().values maxX = ds["XG"].max().values maskC = _xr.where( _np.logical_and( _np.logical_and(ds["YC"] >= minY, ds["YC"] <= maxY), _np.logical_and(ds["XC"] >= minX, ds["XC"] <= maxX), ), 1, 0, ).persist() maskG = _xr.where( _np.logical_and( _np.logical_and(ds["YG"] >= minY, ds["YG"] <= maxY), _np.logical_and(ds["XG"] >= minX, ds["XG"] <= maxX), ), 1, 0, ).persist() maskU = _xr.where( _np.logical_and( _np.logical_and(ds["YU"] >= minY, ds["YU"] <= maxY), _np.logical_and(ds["XU"] >= minX, ds["XU"] <= maxX), ), 1, 0, ).persist() maskV = _xr.where( _np.logical_and( _np.logical_and(ds["YV"] >= minY, ds["YV"] <= maxY), _np.logical_and(ds["XV"] >= minX, ds["XV"] <= maxX), ), 1, 0, ).persist() for var in ds.data_vars: if set(["X", "Y"]).issubset(ds[var].dims): ds[var] = ds[var].where(maskC, drop=True) elif set(["Xp1", "Yp1"]).issubset(ds[var].dims): ds[var] = ds[var].where(maskG, drop=True) elif set(["Xp1", "Y"]).issubset(ds[var].dims): ds[var] = ds[var].where(maskU, drop=True) elif set(["X", "Yp1"]).issubset(ds[var].dims): ds[var] = ds[var].where(maskV, drop=True) # --------------------------- # TIME RESAMPLING # --------------------------- # Resample in time if timeFreq: # Infer original frequency inFreq = _pd.infer_freq(ds.time.values) if timeFreq[0].isdigit() and not inFreq[0].isdigit(): inFreq = "1" + inFreq # Same frequency: Skip if timeFreq == inFreq: _warnings.warn( "\nInput time freq:" "[{}] = Output time frequency: [{}]:" "\nSkip time resampling." "".format(inFreq, timeFreq), stacklevel=2, ) else: # Remove time_midp and warn vars2drop = [ var for var in ds.variables if "time_midp" in ds[var].dims ] if vars2drop: _warnings.warn( "\nTime resampling drops variables" " on `time_midp` dimension." "\nDropped variables: {}." 
"".format(vars2drop), stacklevel=2, ) ds = ds.drop_vars(vars2drop) # Snapshot if sampMethod == "snapshot": # Find new times time2sel = ds["time"].resample(time=timeFreq).first() newtime = ds["time"].sel(time=time2sel) # Use slice when possible inds = [ i for i, t in enumerate(ds["time"].values) if t in newtime.values ] inds_diff = _np.diff(inds) if all(inds_diff == inds_diff[0]): ds = ds.isel(time=slice(inds[0], inds[-1] + 1, inds_diff[0])) else: attrs = ds.attrs ds = _xr.concat( [ds.sel(time=time) for i, time in enumerate(newtime)], dim="time", ) ds.attrs = attrs else: # Mean # Separate time and timeless attrs = ds.attrs ds_dims = ds.drop_vars( [var for var in ds.variables if var not in ds.dims]) ds_time = ds.drop_vars([ var for var in ds.variables if "time" not in ds[var].dims ]) ds_timeless = ds.drop_vars( [var for var in ds.variables if "time" in ds[var].dims]) # Resample ds_time = ds_time.resample(time=timeFreq).mean("time") # Add all dimensions to ds, and fix attributes for dim in ds_time.dims: if dim == "time": ds_time[dim].attrs = ds_dims[dim].attrs else: ds_time[dim] = ds_dims[dim] # Merge ds = _xr.merge([ds_time, ds_timeless]) ds.attrs = attrs # Update oceandataset od._ds = ds # Add time midp if timeFreq and "time" not in dropAxes: od = od.set_grid_coords({ **od.grid_coords, "time": { "time": -0.5 } }, add_midp=True, overwrite=True) # Drop axes grid_coords = od.grid_coords for coord in list(grid_coords): if coord in dropAxes: grid_coords.pop(coord, None) od = od.set_grid_coords(grid_coords, overwrite=True) # Cut axis can't be periodic od = od.set_grid_periodic(periodic) return od
def roc( observations, forecasts, bin_edges="continuous", dim=None, drop_intermediate=False, return_results="area", ): """Computes the relative operating characteristic for a range of thresholds. Parameters ---------- observations : xarray.Dataset or xarray.DataArray Labeled array(s) over which to apply the function. If ``bin_edges=='continuous'``, observations are binary. forecasts : xarray.Dataset or xarray.DataArray Labeled array(s) over which to apply the function. If ``bin_edges=='continuous'``, forecasts are probabilities. bin_edges : array_like, str, default='continuous' Bin edges for categorising observations and forecasts. Similar to np.histogram, \ all but the last (righthand-most) bin include the left edge and exclude the \ right edge. The last bin includes both edges. ``bin_edges`` will be sorted in \ ascending order. If ``bin_edges=='continuous'``, calculate ``bin_edges`` from \ forecasts, equal to ``sklearn.metrics.roc_curve(f_boolean, o_prob)``. dim : str, list The dimension(s) over which to compute the contingency table drop_intermediate : bool, default=False Whether to drop some suboptimal thresholds which would not appear on a plotted ROC curve. This is useful in order to create lighter ROC curves. Defaults to ``True`` in ``sklearn.metrics.roc_curve``. return_results: str, default='area' Specify how return is structed: - 'area': return only the ``area under curve`` of ROC - 'all_as_tuple': return ``true positive rate`` and ``false positive rate`` at each bin and area under the curve of ROC as tuple - 'all_as_metric_dim': return ``true positive rate`` and ``false positive rate`` at each bin and ``area under curve`` of ROC concatinated into new ``metric`` dimension Returns ------- xarray.Dataset or xarray.DataArray : reduced by dimensions ``dim``, see ``return_results`` parameter. ``true positive rate`` and ``false positive rate`` contain ``probability_bin`` dimension with ascending ``bin_edges`` as coordinates. Examples -------- >>> f = xr.DataArray( ... np.random.normal(size=(1000)), coords=[('time', np.arange(1000))] ... ) >>> o = xr.DataArray( ... np.random.normal(size=(1000)), ... coords=[('time', np.arange(1000))] ... ) >>> category_edges = np.linspace(-2, 2, 5) >>> roc(o, f, category_edges, dim=['time']) <xarray.DataArray 'histogram_observations_forecasts' ()> array(0.46812223) See also -------- xskillscore.Contingency sklearn.metrics.roc_curve References ---------- http://www.cawcr.gov.au/projects/verification/ """ if dim is None: dim = list(forecasts.dims) if isinstance(dim, str): dim = [dim] continuous = False if isinstance(bin_edges, str): if bin_edges == "continuous": continuous = True # check that o binary if isinstance(observations, xr.Dataset): o_check = observations.to_array() else: o_check = observations if str(o_check.dtype) != "bool": if not ((o_check == 0) | (o_check == 1)).all(): raise ValueError( 'Input "observations" must represent logical (True/False) outcomes', o_check, ) # works only for 1var if isinstance(forecasts, xr.Dataset): varlist = list(forecasts.data_vars) if len(varlist) == 1: v = varlist[0] else: raise ValueError( "Only works for `xr.Dataset` with one variable, found" f"{forecasts.data_vars}. 
Considering looping over `data_vars`" "or `.to_array()`.") f_bin = forecasts[v] else: f_bin = forecasts f_bin = f_bin.stack(ndim=forecasts.dims) f_bin = f_bin.sortby(-f_bin) bin_edges = np.append(f_bin[0] + 1, f_bin) bin_edges = np.unique(bin_edges) # ensure that in ascending order else: raise ValueError("If bin_edges is str, it can only be continuous.") else: bin_edges = np.sort(bin_edges) # ensure that in ascending order # loop over each bin_edge and get true positive rate and false positive rate # from contingency tpr, fpr = [], [] for i in bin_edges: dichotomous_category_edges = np.array( [-np.inf, i, np.inf]) # "dichotomous" means two-category dichotomous_contingency = Contingency( observations, forecasts, dichotomous_category_edges, dichotomous_category_edges, dim=dim, ) fpr.append(dichotomous_contingency.false_alarm_rate()) tpr.append(dichotomous_contingency.hit_rate()) tpr = xr.concat(tpr, "probability_bin") fpr = xr.concat(fpr, "probability_bin") tpr["probability_bin"] = bin_edges fpr["probability_bin"] = bin_edges fpr = fpr.fillna(1.0) tpr = tpr.fillna(0.0) # pad (0,0) and (1,1) fpr_pad = xr.concat( [ xr.ones_like(fpr.isel(probability_bin=0, drop=False)), fpr, xr.zeros_like(fpr.isel(probability_bin=-1, drop=False)), ], "probability_bin", ) tpr_pad = xr.concat( [ xr.ones_like(tpr.isel(probability_bin=0, drop=False)), tpr, xr.zeros_like(tpr.isel(probability_bin=-1, drop=False)), ], "probability_bin", ) if drop_intermediate and fpr.probability_bin.size > 2: fpr, tpr = _drop_intermediate(fpr, tpr) fpr_pad, tpr_pad = _drop_intermediate(fpr_pad, tpr_pad) area = _auc(fpr_pad, tpr_pad) if continuous: # sklearn returns in reversed order fpr = fpr.sortby(-fpr.probability_bin) tpr = tpr.sortby(-fpr.probability_bin) # mask always nan def _keep_masked(new, ori, dim): """Keep mask from `ori` deprived of dimensions from `dim` in input `new`.""" isel_dim = {d: 0 for d in forecasts.dims if d in dim} mask = ori.isel(isel_dim, drop=True) new_masked = new.where(mask.notnull()) return new_masked fpr = _keep_masked(fpr, forecasts, dim=dim) tpr = _keep_masked(tpr, forecasts, dim=dim) area = _keep_masked(area, forecasts, dim=dim) if return_results == "area": return area elif return_results == "all_as_metric_dim": results = xr.concat([fpr, tpr, area], "metric", coords="minimal") results["metric"] = [ "false positive rate", "true positive rate", "area under curve", ] return results elif return_results == "all_as_tuple": return fpr, tpr, area else: raise NotImplementedError( "expect `return_results` from [all_as_tuple, area, all_as_metric_dim], " f"found {return_results}")
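# Hedged, self-contained sketch (synthetic data) of what `roc` computes per
# threshold: dichotomize the probabilistic forecast, derive hit rate (TPR) and
# false-alarm rate (FPR) against binary observations, then integrate TPR over
# FPR for the area under the curve. This bypasses the Contingency class used above.
import numpy as np
import xarray as xr

rng = np.random.default_rng(0)
obs_sketch = xr.DataArray(rng.random(1000) > 0.5, dims="time")
fct_sketch = xr.DataArray(rng.random(1000), dims="time")

tpr_list, fpr_list = [], []
for thresh in np.linspace(0, 1, 11):
    pred = fct_sketch >= thresh
    tpr_list.append(float((pred & obs_sketch).sum() / obs_sketch.sum()))
    fpr_list.append(float((pred & ~obs_sketch).sum() / (~obs_sketch).sum()))

# trapezoidal area under the (FPR, TPR) curve, points ordered by ascending FPR
x_roc = np.array(fpr_list[::-1])
y_roc = np.array(tpr_list[::-1])
area_sketch = np.sum(np.diff(x_roc) * (y_roc[1:] + y_roc[:-1]) / 2.0)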
def add_bias_column(x_data: xa.DataArray) -> xa.DataArray:
    # Append a column of ones (the bias/intercept term) along the second
    # (feature) dimension of `x_data`.
    return xa.concat(
        [x_data, xa.ones_like(x_data[:, 0])],
        x_data.dims[1],
    )
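# Hedged usage sketch for `add_bias_column` (synthetic data): xr.concat expands
# the 1-D ones array with a length-1 feature dimension before concatenating, so
# the feature axis grows by one.
import numpy as np
import xarray as xr

X_sketch = xr.DataArray(np.random.rand(5, 3), dims=("sample", "feature"))
X_with_bias = xr.concat([X_sketch, xr.ones_like(X_sketch[:, 0])], dim=X_sketch.dims[1])
# X_with_bias.sizes["feature"] is expected to be 4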
def rps( observations, forecasts, category_edges, dim=None, fair=False, weights=None, keep_attrs=False, member_dim="member", ): """Calculate Ranked Probability Score. .. math:: RPS = \\sum_{m=1}^{M}[(\\sum_{k=1}^{m} y_k) - (\\sum_{k=1}^{m} o_k)]^{2} where ``y`` and ``o`` are forecast and observation probabilities in ``M`` categories. .. note:: Takes the sum over all categories as in Weigel et al. 2007 and not the mean as in https://www.cawcr.gov.au/projects/verification/verif_web_page.html#RPS. Therefore RPS has no upper boundary. Parameters ---------- observations : xarray.Dataset or xarray.DataArray The observations of the event. Further requirements are specified based on ``category_edges``. forecasts : xarray.Dataset or xarray.DataArray The forecast of the event with dimension specified by ``member_dim``. Further requirements are specified based on ``category_edges``. category_edges : array_like, xr.Dataset, xr.DataArray, None Edges (left-edge inclusive) of the bins used to calculate the cumulative density function (cdf). Note that here the bins have to include the full range of observations and forecasts data. Effectively, negative infinity is appended to the left side of category_edges, and positive infinity is appended to the right side. Thus, N category edges produces N+1 bins. For example, specifying category_edges = [0,1] will compute the RPS for bins [-inf, 0), [0, 1) and [1, inf), which results in CDF bins [-inf, 0), [-inf, 1) and [-inf, inf). Note that the edges are right-edge exclusive. Forecasts, observations and category_edge are expected in absolute units or probabilities consistently. - np.array (1d): will be internally converted and broadcasted to observations. Use this if you wish to use the same category edges for all elements of both forecasts and observations. - xr.Dataset/xr.DataArray: edges of the categories provided as dimension ``category_edge`` with optional category labels as ``category_edge`` coordinate. Use xr.Dataset/xr.DataArray if edges multi-dimensional and vary across dimensions. Use this if your category edges vary across dimensions of forecasts and observations, but are the same for both. - tuple of np.array/xr.Dataset/xr.DataArray: same as above, where the first item is taken as ``category_edges`` for observations and the second item for ``category_edges`` for forecasts. Use this if your category edges vary across dimensions of forecasts and observations, and are different for each. - None: expect than observations and forecasts are already CDFs containing ``category_edge`` dimension. Use this if your category edges vary across dimensions of forecasts and observations, and are different for each. dim : str or list of str, optional Dimension over which to mean after computing ``rps``. This represents a mean over multiple forecasts-observations pairs. Defaults to None implying averaging over all dimensions. fair: boolean Apply ensemble member-size adjustment for unbiased, fair metric; see Ferro (2013). weights : xr.DataArray with dimensions from dim, optional Weights for `weighted.mean(dim)`. Defaults to None, such that no weighting is applied. keep_attrs : bool If True, the attributes (attrs) will be copied from the first input to the new one. If False (default), the new object will be returned without attributes. member_dim : str, optional Name of ensemble member dimension. By default, 'member'. 
Returns ------- xarray.Dataset or xarray.DataArray: ranked probability score with coords ``forecasts_category_edge`` and ``observations_category_edge`` as str Examples -------- >>> observations = xr.DataArray(np.random.random(size=(3, 3)), ... coords=[('x', np.arange(3)), ... ('y', np.arange(3))]) >>> forecasts = xr.DataArray(np.random.random(size=(3, 3, 3)), ... coords=[('x', np.arange(3)), ... ('y', np.arange(3)), ... ('member', np.arange(3))]) >>> category_edges = np.array([.33, .66]) >>> xs.rps(observations, forecasts, category_edges, dim='x') <xarray.DataArray (y: 3)> array([0.14814815, 0.7037037 , 1.51851852]) Coordinates: * y (y) int64 0 1 2 forecasts_category_edge <U38 '[-np.inf, 0.33), [0.33, 0.66), [0.66, np.inf]' observations_category_edge <U38 '[-np.inf, 0.33), [0.33, 0.66), [0.66, np.inf]' You can also define multi-dimensional ``category_edges``, e.g. with xr.quantile. However, you still need to ensure that ``category_edges`` covers the forecasts and observations distributions. >>> category_edges = observations.quantile( ... q=[.33, .66]).rename({'quantile': 'category_edge'}), >>> xs.rps(observations, forecasts, category_edges, dim='x') <xarray.DataArray (y: 3)> array([1.18518519, 0.85185185, 0.40740741]) Coordinates: * y (y) int64 0 1 2 forecasts_category_edge <U38 '[-np.inf, 0.33), [0.33, 0.66), [0.66, np.inf]' observations_category_edge <U38 '[-np.inf, 0.33), [0.33, 0.66), [0.66, np.inf]' References ---------- * Weigel, A. P., Liniger, M. A., & Appenzeller, C. (2007). The Discrete Brier and Ranked Probability Skill Scores. Monthly Weather Review, 135(1), 118–124. doi: 10/b59qz5 * C. A. T. Ferro. Fair scores for ensemble forecasts. Q.R.J. Meteorol. Soc., 140: 1917–1923, 2013. doi: 10.1002/qj.2270. * https://www-miklip.dkrz.de/about/problems/ """ bin_dim = "category_edge" if member_dim not in forecasts.dims: raise ValueError( f"Expect to find {member_dim} in forecasts dimensions, found" f"{forecasts.dims}.") if fair: M = forecasts[member_dim].size forecasts = _bool_to_int(forecasts) _check_identical_xr_types(observations, forecasts) # different entry point of calculating RPS based on category_edges # category_edges tuple of two: use for obs and forecast category_edges separately if isinstance(category_edges, (tuple, np.ndarray, xr.DataArray, xr.Dataset)): if isinstance(category_edges, tuple): assert isinstance(category_edges[0], type(category_edges[1])) observations_edges, forecasts_edges = category_edges else: # category_edges only given once, so use for both obs and forecasts observations_edges, forecasts_edges = category_edges, category_edges if isinstance(observations_edges, np.ndarray): # convert category_edges as xr object observations_edges = xr.DataArray( observations_edges, dims="category_edge", coords={"category_edge": observations_edges}, ) observations_edges = xr.ones_like( observations) * observations_edges forecasts_edges = xr.DataArray( forecasts_edges, dims="category_edge", coords={"category_edge": forecasts_edges}, ) forecasts_edges = ( xr.ones_like(forecasts if member_dim not in forecasts.dims else forecasts.isel({member_dim: 0}, drop=True)) * forecasts_edges) _check_identical_xr_types(forecasts_edges, forecasts) _check_identical_xr_types(observations_edges, forecasts) # cumulative probability functions # lowest category is [-np.inf, category_edges.isel(category_edge=0)] # ignores the right-most edge. The effective right-most edge is np.inf. # therefore the CDFs Fc and Oc both reach 1 for the right-most edge. 
# < makes edges right-edge exclusive Fc = (forecasts < forecasts_edges).mean(member_dim) Oc = (observations < observations_edges).astype("int") elif category_edges is None: # expect CDFs already as inputs if member_dim in forecasts.dims: forecasts = forecasts.mean(member_dim) Fc = forecasts Oc = observations else: raise ValueError( "category_edges must be xr.DataArray, xr.Dataset, tuple of xr.objects, " f" None or array-like, found {type(category_edges)}") # RPS formulas if fair: # for ensemble member adjustment, see Ferro 2013 Ec = Fc * M res = ((Ec / M - Oc)**2 - Ec * (M - Ec) / (M**2 * (M - 1))).sum(bin_dim) else: # normal formula res = ((Fc - Oc)**2).sum(bin_dim) # add category_edge as str into coords if category_edges is not None: res = _assign_rps_category_bounds(res, observations_edges, "observations") res = _assign_rps_category_bounds(res, forecasts_edges, "forecasts") if weights is not None: res = res.weighted(weights) # combine many forecasts-observations pairs res = res.mean(dim) # keep nans and prevent 0 for all nan grids res = _keep_nans_masked(observations, res, dim, ignore=["category_edge"]) if keep_attrs: # attach by hand res.attrs.update(observations.attrs) res.attrs.update(forecasts.attrs) if isinstance(res, xr.Dataset): for v in res.data_vars: res[v].attrs.update(observations[v].attrs) res[v].attrs.update(forecasts[v].attrs) return res
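# Hedged sketch (synthetic data, mirroring the docstring example) of the CDF
# comparison at the heart of `rps`: cumulative forecast probabilities and
# cumulative observed outcomes at each right-exclusive category edge, squared
# and summed over the edges.
import numpy as np
import xarray as xr

obs_sketch = xr.DataArray(np.random.rand(3, 3), dims=("x", "y"))
fct_sketch = xr.DataArray(np.random.rand(3, 3, 3), dims=("x", "y", "member"))
edges_sketch = xr.DataArray(
    [0.33, 0.66], dims="category_edge", coords={"category_edge": [0.33, 0.66]}
)

Fc_sketch = (fct_sketch < edges_sketch).mean("member")   # forecast CDF at each edge
Oc_sketch = (obs_sketch < edges_sketch).astype("int")    # observed CDF (0/1) at each edge
rps_sketch = ((Fc_sketch - Oc_sketch) ** 2).sum("category_edge").mean("x")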
def merged_mask( basins, ds, lon_name="lon", lat_name="lat", merge_dict=None, verbose=False ): """Combine geographical basins (from regionmask) to larger ocean basins. Parameters ---------- basins : regionmask.core.regions.Regions object Loaded basin data from regionmask, e.g. `import regionmask;basins = regionmask.defined_regions.natural_earth.ocean_basins_50` ds : xr.Dataset Input dataset on which to construct the mask lon_name : str, optional Name of the longitude coordinate in `ds`, defaults to `lon` lat_name : str, optional Name of the latitude coordinate in `ds`, defaults to `lat` merge_dict : dict, optional dictionary defining new aggregated regions (as keys) and the regions to be merge into that region as as values (list of names). Defaults to large scale ocean basins defined by `cmip6_preprocessing.regionmask.default_merge_dict` verbose : bool, optional Prints more output, e.g. the regions in `basins` that were not used in the merging step. Defaults to False. Returns ------- mask : xr.DataArray The mask contains ascending numeric value for each key ( merged region) in `merge_dict`. When the default is used the numeric values correspond to the following regions: * 0: North Atlantic * 1: South Atlantic * 2: North Pacific * 3: South Pacific * 4: Maritime Continent * 5: Indian Ocean * 6: Arctic Ocean * 7: Southern Ocean * 8: Black Sea * 9: Mediterranean Sea *10: Red Sea *11: Caspian Sea """ mask = basins.mask(ds, lon_name=lon_name, lat_name=lat_name) def find_mask_index(name): target_value = [ ri for ri in range(len(basins.regions)) if basins.regions[ri].name == name ] if len(target_value) > 1: warnings.warn(f"Found more than one matching region for {name}") return target_value[0] elif len(target_value) == 1: return target_value[0] else: return None if merge_dict is None: merge_dict = _default_merge_dict() dict_keys = list(merge_dict.keys()) number_dict = {k: None for k in dict_keys} merged_basins = [] for ocean, small_basins in merge_dict.items(): # ocean_idx = find_mask_index(ocean) try: ocean_idx = basins.map_keys(ocean) except (KeyError): # The ocean key is new and cant be found in the previous keys (e.g. for Atlantic full or maritime continent) ocean_idx = mask.max().data + 1 number_dict[ocean] = ocean_idx if small_basins: for sb in small_basins: sb_idx = basins.map_keys(sb) # set the index of each small basin to the ocean value mask = mask.where(mask != sb_idx, ocean_idx) merged_basins.append(sb) if verbose: remaining_basins = [ str(basins.regions[ri].name) for ri in range(len(basins.regions)) if (basins.regions[ri].name not in merged_basins) and (basins.regions[ri].name not in list(merge_dict.keys())) ] print(remaining_basins) # reset the mask indicies to the order of the passed dictionary keys mask_reordered = xr.ones_like(mask.copy()) * np.nan for new_idx, k in enumerate(dict_keys): old_idx = number_dict[k] mask_reordered = mask_reordered.where(mask != old_idx, new_idx) return mask_reordered
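# Hedged sketch (synthetic integer mask, no regionmask dependency) of the
# relabelling pattern inside `merged_mask`: fold several small-basin indices
# into one ocean index, then renumber the merged regions in dictionary order.
import numpy as np
import xarray as xr

mask_sketch = xr.DataArray(np.array([[0, 1, 2], [3, 2, 1]]), dims=("y", "x"))
merge_sketch = {10: [1, 2]}  # new index 10 absorbs regions 1 and 2

for new_idx, old_idxs in merge_sketch.items():
    for old in old_idxs:
        mask_sketch = mask_sketch.where(mask_sketch != old, new_idx)

# renumber so merged regions come out as 0, 1, ... in the requested order
reordered_sketch = xr.ones_like(mask_sketch) * np.nan
for new, old in enumerate([10, 0, 3]):
    reordered_sketch = reordered_sketch.where(mask_sketch != old, new)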
def augment_static_data( dynamic_ds: xr.Dataset, static_ds: xr.Dataset, test_year: Optional[List[str]] = None, dynamic_ignore_vars: List[str] = None, global_means: bool = True, spatial_means: bool = True, ) -> xr.Dataset: """ Create our own aggregations from the dynamic data NOTE: unnecessary for CAMELS because this data can just be taken from pre-computed means """ # get the minimum test_year if isinstance(test_year, Iterable): test_year = min(test_year) # PREVENT temporal leakage of information min_test_date = pd.to_datetime(f"{test_year}-01-01") max_train_date = min_test_date - Day(1) min_train_date = pd.to_datetime(dynamic_ds.time.min().values) dynamic_ds = dynamic_ds.sel(time=slice(min_train_date, max_train_date)) # augment the static data with the variables from dynamic_ds original_vars = list(dynamic_ds.data_vars) if dynamic_ignore_vars is not None: vars_list = [ v for v in original_vars if v not in dynamic_ignore_vars ] else: vars_list = original_vars print("Augmenting the static data with" f" {'global_means' if global_means else ''}" f" {'spatial_means' if spatial_means else ''}" f"for variables: {vars_list}") # check they have the same coords and dtypes reference_coord = [c for c in static_ds.coords][0] assert reference_coord in list( dynamic_ds.coords), (f"Static: {list(static_ds.coords)}" f" Dynamic: {list(dynamic_ds.coords)}") assert static_ds[reference_coord].dtype == dynamic_ds[ reference_coord].dtype, ( f"Static: {static_ds[reference_coord].dtype}" f" Dynamic: {dynamic_ds[reference_coord].dtype}") # calculate ones same shape as the static data first_var = list(static_ds.data_vars)[0] ones = xr.ones_like(static_ds[first_var]) # for each NON-IGNORED dynamic variable calculate global_mean / spatial_mean list_data_arrays: List[xr.DataArray] = [] for var in vars_list: if global_means: # GLOBAL mean global_mean_values = dynamic_ds[var].mean() global_mean_da = (global_mean_values * ones).rename(f"{var}_global_mean") list_data_arrays.append(global_mean_da) if spatial_means: # spatial mean spatial_mean_values = dynamic_ds[var].mean(dim="time") spatial_mean_da = (spatial_mean_values * ones).rename(f"{var}_spatial_mean") list_data_arrays.append(spatial_mean_da) if list_data_arrays != []: # join these new calculated variables into the original ds = xr.combine_by_coords(list_data_arrays) static_ds = static_ds.merge(ds) return static_ds
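# Hedged sketch (synthetic data) of the augmentation step above: broadcast a
# global mean and a time mean of a dynamic variable onto the static-data shape
# by multiplying with an all-ones template, then merge into the static dataset.
import numpy as np
import xarray as xr

static_sketch = xr.Dataset(
    {"elevation": ("station", np.random.rand(4))},
    coords={"station": np.arange(4)},
)
dynamic_sketch = xr.Dataset(
    {"precip": (("time", "station"), np.random.rand(10, 4))},
    coords={"time": np.arange(10), "station": np.arange(4)},
)

ones_sketch = xr.ones_like(static_sketch["elevation"])
global_mean_sketch = (dynamic_sketch["precip"].mean() * ones_sketch).rename("precip_global_mean")
spatial_mean_sketch = (
    dynamic_sketch["precip"].mean(dim="time") * ones_sketch
).rename("precip_spatial_mean")
augmented_sketch = static_sketch.merge(xr.merge([global_mean_sketch, spatial_mean_sketch]))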
def arct_connect(ds, varName, faces="all"): arc_cap = 6 Nx_ac_nrot = [] Ny_ac_nrot = [] Nx_ac_rot = [] Ny_ac_rot = [] ARCT = [0, 0, 0, 0] # initialize the list. arc_faces = [0, 0, 0, 0] metrics = [ "dxC", "dyC", "dxG", "dyG", "hFacW", "hFacS", ] # metric variables defined at vector points if isinstance(faces, str): if faces == "all": faces = [k for k in range(13)] if arc_cap in faces: for k in faces: if k == 2: fac = 1 arc_faces[0] = k _varName = varName DIMS = [dim for dim in ds[_varName].dims if dim != "face"] dims = Dims(DIMS[::-1]) dtr = list(dims)[::-1] dtr[-1], dtr[-2] = dtr[-2], dtr[-1] mask2 = _xr.ones_like(ds[_varName].isel(face=arc_cap)) # TODO: Eval where, define argument outside mask2 = mask2.where( _np.logical_and( ds[dims.X] < ds[dims.Y], ds[dims.X] < len(ds[dims.Y]) - ds[dims.Y], )) x0, xf = 0, int(len(ds[dims.Y]) / 2) # TODO: CHECK here! y0, yf = 0, int(len(ds[dims.X])) xslice = slice(x0, xf) yslice = slice(y0, yf) Nx_ac_nrot.append(0) Ny_ac_nrot.append(len(ds[dims.Y][y0:yf])) da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice} mask_arg = {dims.X: xslice, dims.Y: yslice} if len(dims.X) + len(dims.Y) == 4: if len(dims.Y) == 3 and _varName not in metrics: fac = -1 arct = fac * ds[_varName].isel(**da_arg) Mask = mask2.isel(**mask_arg) arct = arct * Mask ARCT[0] = arct elif k == 5: fac = 1 arc_faces[1] = k _varName = varName DIMS = [dim for dim in ds[_varName].dims if dim != "face"] dims = Dims(DIMS[::-1]) mask5 = _xr.ones_like(ds[_varName].isel(face=arc_cap)) mask5 = mask5.where( _np.logical_and( ds[dims.X] > ds[dims.Y], ds[dims.X] < len(ds[dims.Y]) - ds[dims.Y], )) x0, xf = 0, int(len(ds[dims.X])) y0, yf = 0, int(len(ds[dims.Y]) / 2) xslice = slice(x0, xf) yslice = slice(y0, yf) Nx_ac_nrot.append(0) Ny_ac_nrot.append(len(ds[dims.X][y0:yf])) if len(dims.X) + len(dims.Y) == 4: if len(dims.Y) == 1 and _varName not in metrics: fac = -1 da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice} mask_arg = {dims.X: xslice, dims.Y: yslice} arct = ds[_varName].isel(**da_arg) Mask = mask5.isel(**mask_arg) arct = arct * Mask ARCT[1] = arct elif k == 7: fac = 1 arc_faces[2] = k _varName = varName DIMS = [dim for dim in ds[_varName].dims if dim != "face"] dims = Dims(DIMS[::-1]) dtr = list(dims)[::-1] dtr[-1], dtr[-2] = dtr[-2], dtr[-1] mask7 = _xr.ones_like(ds[_varName].isel(face=arc_cap)) mask7 = mask7.where( _np.logical_and( ds[dims.X] > ds[dims.Y], ds[dims.X] > len(ds[dims.Y]) - ds[dims.Y], )) x0, xf = int(len(ds[dims.Y]) / 2), int(len(ds[dims.Y])) y0, yf = 0, int(len(ds[dims.X])) xslice = slice(x0, xf) yslice = slice(y0, yf) Nx_ac_rot.append(len(ds[dims.Y][x0:xf])) Ny_ac_rot.append(0) da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice} mask_arg = {dims.X: xslice, dims.Y: yslice} arct = fac * ds[_varName].isel(**da_arg) Mask = mask7.isel(**mask_arg) arct = arct * Mask ARCT[2] = arct elif k == 10: fac = 1 _varName = varName DIMS = [dim for dim in ds[_varName].dims if dim != "face"] dims = Dims(DIMS[::-1]) dtr = list(dims)[::-1] dtr[-1], dtr[-2] = dtr[-2], dtr[-1] arc_faces[3] = k mask10 = _xr.ones_like(ds[_varName].isel(face=arc_cap)) mask10 = mask10.where( _np.logical_and( ds[dims.X] < ds[dims.Y], ds[dims.X] > len(ds[dims.Y]) - ds[dims.Y], )) x0, xf = 0, int(len(ds[dims.X])) y0, yf = int(len(ds[dims.Y]) / 2), int(len(ds[dims.Y])) xslice = slice(x0, xf) yslice = slice(y0, yf) Nx_ac_rot.append(0) Ny_ac_rot.append(len(ds[dims.Y][y0:yf])) da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice} mask_arg = {dims.X: xslice, dims.Y: yslice} arct = fac * 
ds[_varName].isel(**da_arg) Mask = mask10.isel(**mask_arg) arct = (arct * Mask).transpose(*dtr) ARCT[3] = arct return arc_faces, Nx_ac_nrot, Ny_ac_nrot, Nx_ac_rot, Ny_ac_rot, ARCT
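# Hedged sketch (synthetic square face, not the llc grid itself) of the
# triangular masking `arct_connect` uses to split the Arctic cap: keep only the
# points on one side of the two diagonals of the face.
import numpy as np
import xarray as xr

n_sketch = 8
face_sketch = xr.DataArray(
    np.random.rand(n_sketch, n_sketch),
    dims=("Y", "X"),
    coords={"Y": np.arange(n_sketch), "X": np.arange(n_sketch)},
)

# wedge facing face 2 in the sketch: X < Y and X < n - Y
mask_sketch = xr.ones_like(face_sketch).where(
    (face_sketch.X < face_sketch.Y) & (face_sketch.X < n_sketch - face_sketch.Y)
)
quadrant_sketch = face_sketch * mask_sketch  # NaN outside the wedge, as above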
for i in range(len(ds["time"])): T = T + [np.datetime64(t0) + np.timedelta64(int(i * step * 1.0e3), "ms")] ds["time"] = np.array(T, dtype="datetime64") T = [] for i in range(len(ds["time_midp"])): T = T + [np.datetime64(t0) + np.timedelta64(int(i * step * 1.0e3), "ms")] ds["time_midp"] = np.array(T, dtype="datetime64") + np.timedelta64( int(0.5 * step * 1.0e3), "ms") # deltas for var in ["drF", "dxC", "dyC", "dxF", "dyF", "dxG", "dyG", "dxV", "dyU"]: ds[var] = xr.full_like(ds[var], step) for var in ["rA", "rAw", "rAs", "rAz"]: ds[var] = xr.full_like(ds[var], step**2) for var in ["HFacC", "HFacW", "HFacS"]: ds[var] = xr.ones_like(ds[var]) # Recreate oceandataset od4calc = OceanDataset(ds) # Gradient sinX = xr.zeros_like(od4calc.dataset["Temp"]) + np.sin(od4calc.dataset["XC"]) sinY = xr.zeros_like(od4calc.dataset["Temp"]) + np.sin(od4calc.dataset["YC"]) sinZ = xr.zeros_like(od4calc.dataset["Temp"]) + np.sin(od4calc.dataset["Z"]) sintime = xr.zeros_like(od4calc.dataset["Temp"]) + np.sin( (od4calc.dataset["time"] - od4calc.dataset["time"][0]) / np.timedelta64(1, "s")) sintime.attrs = od4calc.dataset["time"].attrs cosX = xr.zeros_like(od4calc.dataset["U"]) + np.cos(od4calc.dataset["XU"]) cosY = xr.zeros_like(od4calc.dataset["V"]) + np.cos(od4calc.dataset["YV"])
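# Hedged sketch (synthetic data) of the broadcasting trick used above for the
# sinX/cosX test fields: adding a function of a 1-D coordinate to a zeros_like
# template expands it to the full shape of the variable.
import numpy as np
import xarray as xr

temp_sketch = xr.DataArray(
    np.zeros((3, 4, 5)),
    dims=("time", "Z", "X"),
    coords={"X": np.linspace(0.0, 2 * np.pi, 5)},
)
sinX_sketch = xr.zeros_like(temp_sketch) + np.sin(temp_sketch["X"])  # full shape, varying only in X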