def _make_boku_ndvi_dataset(
    size,
    lonmin=-180.0,
    lonmax=180.0,
    latmin=-55.152,
    latmax=75.024,
    kenya_only=False,
):
    lat_len, lon_len = size
    if kenya_only:
        kenya = get_kenya()
        latmin = kenya.latmin
        latmax = kenya.latmax
        lonmin = kenya.lonmin
        lonmax = kenya.lonmax

    # create the coordinate vectors
    longitudes = np.linspace(lonmin, lonmax, lon_len)
    latitudes = np.linspace(latmin, latmax, lat_len)

    dims = ["lat", "lon"]
    coords = {"lat": latitudes, "lon": longitudes}

    # sample from the integer pixel values 1-251, plus 255
    modis_vals = np.append(np.arange(1, 252), 255)
    data = np.random.choice(modis_vals, size=size)

    return xr.Dataset({"boku_ndvi": (dims, data)}, coords=coords)
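# NOTE: the `test_preprocess` tests below also rely on a `_make_dataset`
# helper that is not shown in this section. Judging by its call sites
# (a 3-tuple return, lat/lon bounds, and a `size` argument), it plausibly
# looks like the sketch below; the variable name, time coordinate, and
# extra return values here are assumptions for illustration, not the
# repo's actual implementation.
import numpy as np
import pandas as pd
import xarray as xr


def _make_dataset_sketch(size, lonmin=-180.0, lonmax=180.0, latmin=-90.0, latmax=90.0):
    lat_len, lon_len = size
    longitudes = np.linspace(lonmin, lonmax, lon_len)
    latitudes = np.linspace(latmin, latmax, lat_len)
    times = pd.date_range("2000-01-01", periods=12, freq="MS")

    # random data cube on a (time, lat, lon) grid
    data = np.random.rand(len(times), lat_len, lon_len)
    ds = xr.Dataset(
        {"VHI": (["time", "lat", "lon"], data)},
        coords={"time": times, "lat": latitudes, "lon": longitudes},
    )
    return ds, (lonmin, lonmax), (latmin, latmax)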
def test_preprocess(self, tmp_path):
    (tmp_path / "raw/boku_ndvi_1000").mkdir(parents=True)

    RAW_FILES = [
        "MCD13A2.t200915.006.EAv1.1_km_10_days_NDVI.O1.nc",
        "MCD13A2.t201107.006.EAv1.1_km_10_days_NDVI.O1.nc",
        "MCD13A2.t201330.006.EAv1.1_km_10_days_NDVI.O1.nc",
        "MCD13A2.t201733.006.EAv1.1_km_10_days_NDVI.O1.nc",
    ]
    for raw_file in RAW_FILES:
        data_path = tmp_path / f"raw/boku_ndvi_1000/{raw_file}"
        dataset = self._make_boku_ndvi_dataset(size=(100, 100), kenya_only=True)
        dataset.to_netcdf(path=data_path)

    kenya = get_kenya()
    regrid_dataset, _, _ = _make_dataset(
        size=(20, 20),
        latmin=kenya.latmin,
        latmax=kenya.latmax,
        lonmin=kenya.lonmin,
        lonmax=kenya.lonmax,
    )

    regrid_path = tmp_path / "regridder.nc"
    regrid_dataset.to_netcdf(regrid_path)

    processor = BokuNDVIPreprocessor(tmp_path)
    processor.preprocess(subset_str="kenya", regrid=regrid_path, cleanup=True)

    expected_out_path = tmp_path / "interim/boku_ndvi_1000_preprocessed/data_kenya.nc"
    assert (
        expected_out_path.exists()
    ), f"Expected processed file to be saved to {expected_out_path}"

    # check the subsetting happened correctly
    out_data = xr.open_dataset(expected_out_path)
    expected_dims = ["lat", "lon", "time"]
    assert len(list(out_data.dims)) == len(expected_dims)
    for dim in expected_dims:
        assert dim in list(
            out_data.dims
        ), f"Expected {dim} to be in the processed dataset dims"

    lons = out_data.lon.values
    assert (lons.min() >= kenya.lonmin) and (
        lons.max() <= kenya.lonmax
    ), "Longitudes not correctly subset"

    lats = out_data.lat.values
    assert (lats.min() >= kenya.latmin) and (
        lats.max() <= kenya.latmax
    ), "Latitudes not correctly subset"

    assert out_data["boku_ndvi"].values.shape[1:] == (20, 20)

    assert (
        not processor.interim.exists()
    ), "Interim boku_ndvi folder should have been deleted"
def test_preprocess(self, tmp_path):
    (tmp_path / "raw/reanalysis-era5-land/2m_temperature/1979_2019").mkdir(
        parents=True
    )
    data_path = (
        tmp_path / "raw/reanalysis-era5-land/2m_temperature/1979_2019/01_12.nc"
    )
    dataset = self._make_era5_dataset(size=(100, 100))
    dataset.to_netcdf(path=data_path)

    kenya = get_kenya()
    regrid_dataset, _, _ = _make_dataset(
        size=(20, 20),
        latmin=kenya.latmin,
        latmax=kenya.latmax,
        lonmin=kenya.lonmin,
        lonmax=kenya.lonmax,
    )

    regrid_path = tmp_path / "regridder.nc"
    regrid_dataset.to_netcdf(regrid_path)

    processor = ERA5LandPreprocessor(tmp_path)
    processor.preprocess(
        subset_str="kenya",
        regrid=regrid_path,
        parallel_processes=1,
        variable="2m_temperature",
    )

    expected_out_path = (
        tmp_path
        / "interim/reanalysis-era5-land_preprocessed/reanalysis-era5-land_kenya.nc"
    )
    assert (
        expected_out_path.exists()
    ), f"Expected processed file to be saved to {expected_out_path}"

    # check the subsetting happened correctly
    out_data = xr.open_dataset(expected_out_path)
    expected_dims = ["lat", "lon", "time"]
    assert len(list(out_data.dims)) == len(expected_dims)
    for dim in expected_dims:
        assert dim in list(
            out_data.dims
        ), f"Expected {dim} to be in the processed dataset dims"

    lons = out_data.lon.values
    assert (lons.min() >= kenya.lonmin) and (
        lons.max() <= kenya.lonmax
    ), "Longitudes not correctly subset"

    lats = out_data.lat.values
    assert (lats.min() >= kenya.latmin) and (
        lats.max() <= kenya.latmax
    ), "Latitudes not correctly subset"

    assert out_data.t2m.values.shape[1:] == (20, 20)

    assert (
        not processor.interim.exists()
    ), "Interim era5-land folder should have been deleted"
def test_preprocess(self, tmp_path):
    (tmp_path / "raw/era5POS/global").mkdir(parents=True)
    data_path = tmp_path / "raw/era5POS/global/testy_test.nc"
    dataset = self._make_era5POS_dataset(size=(100, 100))
    dataset.to_netcdf(path=data_path)

    kenya = get_kenya()
    regrid_dataset, _, _ = _make_dataset(
        size=(20, 20),
        latmin=kenya.latmin,
        latmax=kenya.latmax,
        lonmin=kenya.lonmin,
        lonmax=kenya.lonmax,
    )

    regrid_path = tmp_path / "regridder.nc"
    regrid_dataset.to_netcdf(regrid_path)

    processor = PlanetOSPreprocessor(tmp_path)
    processor.preprocess(subset_str="kenya", regrid=regrid_path, parallel=False)

    expected_out_path = tmp_path / "interim/era5POS_preprocessed/data_kenya.nc"
    assert (
        expected_out_path.exists()
    ), f"Expected processed file to be saved to {expected_out_path}"

    # check the subsetting happened correctly
    out_data = xr.open_dataset(expected_out_path)
    expected_dims = ["lat", "lon", "time"]
    assert len(list(out_data.dims)) == len(expected_dims)
    for dim in expected_dims:
        assert dim in list(
            out_data.dims
        ), f"Expected {dim} to be in the processed dataset dims"

    lons = out_data.lon.values
    assert (lons.min() >= kenya.lonmin) and (
        lons.max() <= kenya.lonmax
    ), "Longitudes not correctly subset"

    lats = out_data.lat.values
    assert (lats.min() >= kenya.latmin) and (
        lats.max() <= kenya.latmax
    ), "Latitudes not correctly subset"

    assert out_data.VHI.values.shape[1:] == (20, 20)
    assert out_data.precip.values.shape[1:] == (20, 20)

    assert (
        not processor.interim.exists()
    ), "Interim era5POS folder should have been deleted"
def test_preprocess(self, tmp_path):
    (tmp_path / "raw/gleam/monthly").mkdir(parents=True)
    data_path = tmp_path / "raw/gleam/monthly/testy_test.nc"
    dataset = self._make_gleam_dataset(size=(100, 100))
    dataset.to_netcdf(path=data_path)

    kenya = get_kenya()
    regrid_dataset, _, _ = _make_dataset(
        size=(20, 20),
        latmin=kenya.latmin,
        latmax=kenya.latmax,
        lonmin=kenya.lonmin,
        lonmax=kenya.lonmax,
    )

    regrid_path = tmp_path / "regridder.nc"
    regrid_dataset.to_netcdf(regrid_path)

    processor = GLEAMPreprocessor(tmp_path)
    processor.preprocess(subset_str="kenya", regrid=regrid_path)

    expected_out_path = tmp_path / "interim/gleam_preprocessed/data_kenya.nc"
    assert (
        expected_out_path.exists()
    ), f"Expected processed file to be saved to {expected_out_path}"

    # check the subsetting happened correctly
    out_data = xr.open_dataset(expected_out_path)
    expected_dims = ["lat", "lon", "time"]
    assert len(list(out_data.dims)) == len(expected_dims)
    for dim in expected_dims:
        assert dim in list(
            out_data.dims
        ), f"Expected {dim} to be in the processed dataset dims"

    lons = out_data.lon.values
    assert (lons.min() >= kenya.lonmin) and (
        lons.max() <= kenya.lonmax
    ), "Longitudes not correctly subset"

    lats = out_data.lat.values
    assert (lats.min() >= kenya.latmin) and (
        lats.max() <= kenya.latmax
    ), "Latitudes not correctly subset"

    assert set(out_data.data_vars) == {"E"}, "Got unexpected variables!"

    assert (
        not processor.interim.exists()
    ), "Interim gleam folder should have been deleted"
def test_preprocess(self, tmp_path):
    (tmp_path / 'raw/reanalysis-era5-single-levels-monthly-means/'
                '2m_temperature/1979_2019').mkdir(parents=True)
    data_path = (tmp_path / 'raw/reanalysis-era5-single-levels-monthly-means/'
                            '2m_temperature/1979_2019/01_12.nc')
    dataset = self._make_era5_dataset(size=(100, 100))
    dataset.to_netcdf(path=data_path)

    kenya = get_kenya()
    regrid_dataset, _, _ = _make_dataset(size=(20, 20),
                                         latmin=kenya.latmin, latmax=kenya.latmax,
                                         lonmin=kenya.lonmin, lonmax=kenya.lonmax)

    regrid_path = tmp_path / 'regridder.nc'
    regrid_dataset.to_netcdf(regrid_path)

    processor = ERA5MonthlyMeanPreprocessor(tmp_path)
    processor.preprocess(subset_str='kenya', regrid=regrid_path, parallel=False)

    expected_out_path = (tmp_path / 'interim/reanalysis-era5-single-levels-monthly-'
                                    'means_preprocessed/data_kenya.nc')
    assert expected_out_path.exists(), \
        f'Expected processed file to be saved to {expected_out_path}'

    # check the subsetting happened correctly
    out_data = xr.open_dataset(expected_out_path)
    expected_dims = ['lat', 'lon', 'time']
    assert len(list(out_data.dims)) == len(expected_dims)
    for dim in expected_dims:
        assert dim in list(out_data.dims), \
            f'Expected {dim} to be in the processed dataset dims'

    lons = out_data.lon.values
    assert (lons.min() >= kenya.lonmin) and (lons.max() <= kenya.lonmax), \
        'Longitudes not correctly subset'
    lats = out_data.lat.values
    assert (lats.min() >= kenya.latmin) and (lats.max() <= kenya.latmax), \
        'Latitudes not correctly subset'

    assert out_data.t2m.values.shape[1:] == (20, 20)

    assert not processor.interim.exists(), \
        'Interim era5 folder should have been deleted'
def test_preprocess(self, tmp_path):
    (tmp_path / 'raw/chirps/global').mkdir(parents=True)
    data_path = tmp_path / 'raw/chirps/global/testy_test.nc'
    dataset = self._make_chirps_dataset(size=(100, 100))
    dataset.to_netcdf(path=data_path)

    kenya = get_kenya()
    regrid_dataset, _, _ = _make_dataset(size=(20, 20),
                                         latmin=kenya.latmin, latmax=kenya.latmax,
                                         lonmin=kenya.lonmin, lonmax=kenya.lonmax)

    regrid_path = tmp_path / 'regridder.nc'
    regrid_dataset.to_netcdf(regrid_path)

    processor = CHIRPSPreprocesser(tmp_path)
    processor.preprocess(subset_str='kenya', regrid=regrid_path, parallel=False)

    expected_out_path = tmp_path / 'interim/chirps_preprocessed/data_kenya.nc'
    assert expected_out_path.exists(), \
        f'Expected processed file to be saved to {expected_out_path}'

    # check the subsetting happened correctly
    out_data = xr.open_dataset(expected_out_path)
    expected_dims = ['lat', 'lon', 'time']
    assert len(list(out_data.dims)) == len(expected_dims)
    for dim in expected_dims:
        assert dim in list(out_data.dims), \
            f'Expected {dim} to be in the processed dataset dims'

    lons = out_data.lon.values
    assert (lons.min() >= kenya.lonmin) and (lons.max() <= kenya.lonmax), \
        'Longitudes not correctly subset'
    lats = out_data.lat.values
    assert (lats.min() >= kenya.latmin) and (lats.max() <= kenya.latmax), \
        'Latitudes not correctly subset'

    assert out_data.VHI.values.shape[1:] == (20, 20)

    assert not processor.interim.exists(), \
        'Interim chirps folder should have been deleted'
def test_preprocess(self, tmp_path):
    out_dir = tmp_path / "data" / "raw" / "s5"
    out_dir = (
        out_dir / "seasonal-monthly-pressure-levels" / "2m_temperature" / str(2018)
    )
    if not out_dir.exists():
        out_dir.mkdir(exist_ok=True, parents=True)

    # run the preprocessor against dummy OUCE data
    # (because writing the test data to .grib is failing)
    ouce_dir = make_dummy_ouce_s5_data(tmp_path)

    kenya = get_kenya()
    regrid_dataset, _, _ = _make_dataset(
        size=(20, 20),
        latmin=kenya.latmin,
        latmax=kenya.latmax,
        lonmin=kenya.lonmin,
        lonmax=kenya.lonmax,
    )

    # the reference dataset to regrid to
    regrid_path = tmp_path / "regridder.nc"
    regrid_dataset.to_netcdf(regrid_path)

    # run the preprocessing
    processor = S5Preprocessor(tmp_path / "data", ouce_server=True)
    processor.preprocess(
        subset_str="kenya",
        regrid=regrid_path,
        variable="2m_temperature",
        cleanup=True,
        **dict(ouce_dir=ouce_dir.parents[2], infer=True),
    )

    # check the preprocessed file exists
    assert (
        processor.preprocessed_folder / "s5_preprocessed" / "s5_t2m_kenya.nc"
    ).exists(), (
        "Expecting to find the kenyan_subset netcdf file at "
        "preprocessed / s5_preprocessed / s5_{variable}_{subset_str}.nc"
    )

    # open the data
    out_data = xr.open_dataset(
        processor.preprocessed_folder / "s5_preprocessed" / "s5_t2m_kenya.nc"
    )

    # check the subsetting happened properly
    expected_dims = [
        "lat",
        "lon",
        "initialisation_date",
        "forecast_horizon",
        "number",
    ]
    assert len(list(out_data.dims)) == len(expected_dims)
    for dim in expected_dims:
        assert dim in list(
            out_data.dims
        ), f"Expected {dim} to be in the processed dataset dims"

    lons = out_data.lon.values
    assert (lons.min() >= kenya.lonmin) and (
        lons.max() <= kenya.lonmax
    ), "Longitudes not correctly subset"

    lats = out_data.lat.values
    assert (lats.min() >= kenya.latmin) and (
        lats.max() <= kenya.latmax
    ), "Latitudes not correctly subset"

    # check the lat/lon is the correct shape
    assert out_data.t2m.values.shape[-2:] == (20, 20)

    # test the stacking to select the forecast time
    # NOTE: this is how to select `real time` data from the S5 forecasts:
    # each (initialisation_date, forecast_horizon) pair maps to the calendar
    # date (valid_time) that the forecast is valid for
    out_data["valid_time"] = (
        out_data.initialisation_date + out_data.forecast_horizon
    )
    stacked = out_data.stack(time=("initialisation_date", "forecast_horizon"))
    assert stacked.time.shape == (10,), "should be a 1D vector"

    selected = stacked.swap_dims({"time": "valid_time"}).sel(valid_time="2008-03")
    assert selected.time.size == 6, (
        "Should have only selected 6 timesteps for the month 2008-03. "
        "The calculation of valid_time is complicated, but it should select "
        "the forecasts that fall within the month of interest."
    )

    # check the cleanup has worked
    assert (
        not processor.interim.exists()
    ), "Interim S5 folder should have been deleted"
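# A toy illustration of the `valid_time` trick tested above, on assumed
# shapes (5 initialisation dates x 2 forecast horizons = 10 stacked times).
# Each (initialisation_date, forecast_horizon) pair maps to the calendar
# date the forecast is valid for, so selecting valid_time="2008-03" picks
# every forecast that lands in March 2008, regardless of when it was
# initialised.
import numpy as np
import pandas as pd
import xarray as xr

init = pd.date_range("2008-01-01", periods=5, freq="MS")
horizon = pd.to_timedelta([30, 60], unit="D")
toy = xr.Dataset(
    {"t2m": (["initialisation_date", "forecast_horizon"], np.random.rand(5, 2))},
    coords={"initialisation_date": init, "forecast_horizon": horizon},
)
toy["valid_time"] = toy.initialisation_date + toy.forecast_horizon
stacked = toy.stack(time=("initialisation_date", "forecast_horizon"))
march = stacked.swap_dims({"time": "valid_time"}).sel(valid_time="2008-03")
# -> the three forecasts whose valid_time falls inside March 2008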
def test_preprocess(self, tmp_path, cleanup):
    (tmp_path / "raw/esa_cci_landcover").mkdir(parents=True)
    data_path = tmp_path / "raw/esa_cci_landcover/1992-v2.0.7b_testy_test.nc"
    dataset = self._make_ESA_CCI_dataset(size=(100, 100))
    dataset.to_netcdf(path=data_path)

    legend_path = tmp_path / "raw/esa_cci_landcover/legend.csv"
    self._make_ESA_CCI_legend().to_csv(legend_path)

    kenya = get_kenya()
    regrid_dataset, _, _ = _make_dataset(
        size=(20, 20),
        latmin=kenya.latmin,
        latmax=kenya.latmax,
        lonmin=kenya.lonmin,
        lonmax=kenya.lonmax,
    )

    regrid_path = tmp_path / "regridder.nc"
    regrid_dataset.to_netcdf(regrid_path)

    processor = ESACCIPreprocessor(tmp_path)
    processor.preprocess(subset_str="kenya", regrid=regrid_path, cleanup=cleanup)

    expected_out_path = (
        tmp_path / "interim/static/esa_cci_landcover_interim"
        "/1992_1992-v2.0.7b_testy_test_kenya.nc"
    )
    if not cleanup:
        assert (
            expected_out_path.exists()
        ), f"Expected processed file to be saved to {expected_out_path}"

    expected_out_processed = (
        tmp_path
        / "interim/static/esa_cci_landcover_preprocessed"
        / "esa_cci_landcover_kenya_one_hot.nc"
    )
    assert expected_out_processed.exists(), "expected a processed folder"

    # check the subsetting happened correctly
    out_data = xr.open_dataset(expected_out_processed)
    expected_dims = ["lat", "lon"]
    assert len(list(out_data.dims)) == len(expected_dims)
    for dim in expected_dims:
        assert dim in list(
            out_data.dims
        ), f"Expected {dim} to be in the processed dataset dims"

    lons = out_data.lon.values
    assert (lons.min() >= kenya.lonmin) and (
        lons.max() <= kenya.lonmax
    ), "Longitudes not correctly subset"

    lats = out_data.lat.values
    assert (lats.min() >= kenya.latmin) and (
        lats.max() <= kenya.latmax
    ), "Latitudes not correctly subset"

    if cleanup:
        assert (
            not processor.interim.exists()
        ), "Interim esa_cci_landcover folder should have been deleted"
era.detect(
    variable='tp', time_period='dayofyear', hilo='low', method='std'
)
e_runs = era.calculate_runs()

chirp = EventDetector(chirps_dir)
chirp.detect(
    variable='precip', time_period='month', hilo='low', method='std'
)
c_runs = chirp.calculate_runs()

mask = get_ds_mask(chirp.ds.precip)
c_runs = c_runs.where(~mask)

kenya = get_kenya()

# ------------------------------------------------------------------------------
# plot the climatology / threshold values and the mean run lengths
# ------------------------------------------------------------------------------
c = chirp

fig, ax = plt.subplots()
c.clim.mean(dim=['lat', 'lon']).precip.plot(ax=ax)
c.thresh.mean(dim=['lat', 'lon']).precip.plot(ax=ax)
ax.set_title('Threshold & Climatology Values for Precip (monthly) [mm day-1]')

fig, ax = plot_geog_location(kenya, lakes=False, borders=True, rivers=True, scale=0.8)
c_runs.mean(dim='time').plot(ax=ax)
ax.set_title('Mean Run Length (Consecutive Months with -1 STD)')
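# `calculate_runs` is repo-specific; the sketch below illustrates the
# underlying idea (the length of the run of consecutive flagged timesteps
# ending at each index), on a 1D boolean series rather than the full
# (time, lat, lon) cube the EventDetector works over. The helper name and
# signature are assumptions for illustration only.
import numpy as np


def run_lengths(flags):
    # run length of consecutive True values ending at each index
    runs = np.zeros(len(flags), dtype=int)
    for i, flag in enumerate(flags):
        runs[i] = runs[i - 1] + 1 if (flag and i > 0) else int(flag)
    return runs


# e.g. months flagged as below -1 std of the climatology:
run_lengths(np.array([True, True, False, True, True, True]))
# -> array([1, 2, 0, 1, 2, 3])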
%load_ext autoreload
%autoreload 2

data_dir = Path('data')

# Initialise the ERA5 exporter
e = ERA5Exporter(data_dir)

# valid SEAS5 datasets
valid_datasets = [
    'seasonal-original-single-levels',
    'seasonal-original-pressure-levels',
    'seasonal-monthly-single-levels',
    'seasonal-monthly-pressure-levels',
]

kenya_region = get_kenya()

variable = 'total_precipitation'
dataset = 'seasonal-original-single-levels'
area = e.create_area(kenya_region)

# times
years = list(range(2017, 2019))
months = list(range(1, 13))

# leadtime_hour (0-215 days)
leadtime_hours = [days * 24 for days in range(1, 20)]
all_leadtimes = [days * 24 for days in range(1, 216)]
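# The pieces above would typically be assembled into a CDS-style selection
# request. The exporter's actual call signature isn't shown here, so this is
# only a sketch of what such a request might contain (the key names follow
# the CDS API conventions for the seasonal datasets; treat them as
# assumptions):
selection_request = {
    'variable': [variable],
    'year': [str(y) for y in years],
    'month': [f'{m:02d}' for m in months],
    'leadtime_hour': [str(lt) for lt in leadtime_hours],
    'area': area,
    'format': 'grib',
}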
def test_preprocess(self, tmp_path, granularity):
    if granularity == "monthly":
        basename = "reanalysis-era5-single-levels-monthly-means"
        processor = ERA5MonthlyMeanPreprocessor(tmp_path)
    elif granularity == "hourly":
        basename = "reanalysis-era5-single-levels"
        processor = ERA5HourlyPreprocessor(tmp_path)

    (tmp_path / f"raw/{basename}/2m_temperature/1979_2019").mkdir(parents=True)
    data_path = tmp_path / f"raw/{basename}/2m_temperature/1979_2019/01_12.nc"

    if granularity == "hourly":
        dataset = self._make_era5_dataset(size=(100, 100), monthly=False)
    else:
        dataset = self._make_era5_dataset(size=(100, 100), monthly=True)
    dataset.to_netcdf(path=data_path)

    kenya = get_kenya()
    regrid_dataset, _, _ = _make_dataset(
        size=(20, 20),
        latmin=kenya.latmin,
        latmax=kenya.latmax,
        lonmin=kenya.lonmin,
        lonmax=kenya.lonmax,
    )

    regrid_path = tmp_path / "regridder.nc"
    regrid_dataset.to_netcdf(regrid_path)

    processor.preprocess(subset_str="kenya", regrid=regrid_path, parallel=False)

    expected_out_path = tmp_path / f"interim/{basename}_preprocessed/data_kenya.nc"
    assert (
        expected_out_path.exists()
    ), f"Expected processed file to be saved to {expected_out_path}"

    # check the subsetting happened correctly
    out_data = xr.open_dataset(expected_out_path)
    expected_dims = ["lat", "lon", "time"]
    assert len(list(out_data.dims)) == len(expected_dims)
    for dim in expected_dims:
        assert dim in list(
            out_data.dims
        ), f"Expected {dim} to be in the processed dataset dims"

    lons = out_data.lon.values
    assert (lons.min() >= kenya.lonmin) and (
        lons.max() <= kenya.lonmax
    ), "Longitudes not correctly subset"

    lats = out_data.lat.values
    assert (lats.min() >= kenya.latmin) and (
        lats.max() <= kenya.latmax
    ), "Latitudes not correctly subset"

    assert out_data.t2m.values.shape[1:] == (20, 20)

    assert (
        not processor.interim.exists()
    ), "Interim era5 folder should have been deleted"