def test_make_filename():
    """create_filename should flatten variable/year directories into one name."""
    source = Path(
        "reanalysis-era5-single-levels-monthly-means/2m_temperature/1979_2019/01_12.nc"
    )
    result = ERA5MonthlyMeanPreprocessor.create_filename(source, "kenya")
    want = "1979_2019_01_12_2m_temperature_kenya.nc"
    assert result == want, f"{result} generated, expected {want}"
def test_make_filename():
    """Filename generated from a raw ERA5 path should encode years, months,
    variable and subset string."""
    raw = 'reanalysis-era5-single-levels-monthly-means/2m_temperature/1979_2019/01_12.nc'
    generated = ERA5MonthlyMeanPreprocessor.create_filename(Path(raw), 'kenya')
    expected_name = '1979_2019_01_12_2m_temperature_kenya.nc'
    assert generated == expected_name, f'{generated} generated, expected {expected_name}'
def preprocess_era5():
    """Preprocess ERA5 monthly means for Kenya, regridding onto the
    already-preprocessed VCI grid (which must exist)."""
    data_path = get_data_path()
    regrid_path = data_path / "interim/VCI_preprocessed/data_kenya.nc"
    # The VCI preprocessor must have run first to provide the target grid.
    assert regrid_path.exists(), f"{regrid_path} not available"
    ERA5MonthlyMeanPreprocessor(data_path).preprocess(
        subset_str="kenya", regrid=regrid_path
    )
def test_init(self, tmp_path):
    """Constructing the preprocessor should create its output directories."""
    ERA5MonthlyMeanPreprocessor(tmp_path)
    interim_root = tmp_path / 'interim'
    for suffix in ('preprocessed', 'interim'):
        folder = f'reanalysis-era5-single-levels-monthly-means_{suffix}'
        assert (interim_root / folder).exists()
def preprocess_era5(subset_str: str = "kenya"):
    """Preprocess ERA5 monthly means for *subset_str* with regridding disabled.

    To regrid against the era5-land preprocessed grid again, restore:
        regrid = root / f"interim/reanalysis-era5-land_preprocessed/data_{subset_str}.nc"
        assert regrid.exists(), f"{regrid} not available"
    """
    root = get_data_path()
    regrid = None  # regridding deliberately switched off (see docstring)
    ERA5MonthlyMeanPreprocessor(root).preprocess(subset_str=subset_str, regrid=regrid)
def preprocess_era5():
    """Run Kenya ERA5 monthly-mean preprocessing against the VCI grid.

    Works when launched either from the repository root ('ml_drought') or
    from one directory below it.
    """
    in_repo_root = Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought'
    data_path = Path('data') if in_repo_root else Path('../data')
    regrid_path = data_path / 'interim/VCI_preprocessed/data_kenya.nc'
    # The VCI preprocessing step must have produced the target grid already.
    assert regrid_path.exists(), f'{regrid_path} not available'
    processor = ERA5MonthlyMeanPreprocessor(data_path)
    processor.preprocess(subset_str='kenya', regrid=regrid_path)
def test_preprocess(self, tmp_path):
    """End-to-end check of ERA5MonthlyMeanPreprocessor.preprocess.

    Builds a fake 100x100 raw ERA5 file plus a 20x20 Kenya target grid, runs
    the preprocessor, then verifies: the output file exists, dims are exactly
    (lat, lon, time), lat/lon are subset to Kenya's bounds, the spatial shape
    matches the regrid target, and the interim folder was cleaned up.
    """
    (tmp_path / "raw/reanalysis-era5-single-levels-monthly-means/"
     "2m_temperature/1979_2019").mkdir(parents=True)
    data_path = (tmp_path / "raw/reanalysis-era5-single-levels-monthly-means/"
                 "2m_temperature/1979_2019/01_12.nc")
    dataset = self._make_era5_dataset(size=(100, 100))
    dataset.to_netcdf(path=data_path)

    kenya = get_kenya()
    # Target grid the raw data should be regridded onto.
    regrid_dataset, _, _ = _make_dataset(
        size=(20, 20),
        latmin=kenya.latmin,
        latmax=kenya.latmax,
        lonmin=kenya.lonmin,
        lonmax=kenya.lonmax,
    )
    regrid_path = tmp_path / "regridder.nc"
    regrid_dataset.to_netcdf(regrid_path)

    processor = ERA5MonthlyMeanPreprocessor(tmp_path)
    processor.preprocess(subset_str="kenya", regrid=regrid_path, parallel=False)

    expected_out_path = (tmp_path / "interim/reanalysis-era5-single-levels-monthly-"
                         "means_preprocessed/data_kenya.nc")
    assert expected_out_path.exists(), (
        f"Expected processed file to be saved to {expected_out_path}"
    )

    # check the subsetting happened correctly; use a context manager so the
    # netCDF handle is closed (the original leaked it, which can block
    # tmp_path cleanup on some platforms).
    with xr.open_dataset(expected_out_path) as out_data:
        expected_dims = ["lat", "lon", "time"]
        assert len(list(out_data.dims)) == len(expected_dims)
        for dim in expected_dims:
            assert dim in list(out_data.dims), (
                f"Expected {dim} to be in the processed dataset dims"
            )

        lons = out_data.lon.values
        assert (lons.min() >= kenya.lonmin) and (
            lons.max() <= kenya.lonmax
        ), "Longitudes not correctly subset"

        lats = out_data.lat.values
        assert (lats.min() >= kenya.latmin) and (
            lats.max() <= kenya.latmax
        ), "Latitudes not correctly subset"

        # spatial shape must match the 20x20 regrid target (dim 0 is time)
        assert out_data.t2m.values.shape[1:] == (20, 20)

    assert not processor.interim.exists(), (
        "Interim era5 folder should have been deleted"
    )
def test_get_filenames(tmp_path):
    """get_filepaths should return the raw netcdf file that was created."""
    raw_dir = (tmp_path / 'raw/reanalysis-era5-single-levels-monthly-means/'
               '2m_temperature/1979_2019')
    raw_dir.mkdir(parents=True)
    # NOTE(review): the file is named '1979_2019.01_12.nc' and sits beside
    # (not inside) the 1979_2019 directory — presumably get_filepaths globs
    # recursively so placement does not matter; confirm this is intentional.
    test_file = (tmp_path / 'raw/reanalysis-era5-single-levels-'
                 'monthly-means/2m_temperature/1979_2019.01_12.nc')
    test_file.touch()
    retrieved = ERA5MonthlyMeanPreprocessor(tmp_path).get_filepaths()
    assert retrieved[0] == test_file, f'Expected {test_file} to be retrieved'
def test_make_filename(tmp_path, granularity):
    """create_filename should flatten the raw layout for either granularity."""
    if granularity == "monthly":
        basename = "reanalysis-era5-single-levels-monthly-means"
        processor = ERA5MonthlyMeanPreprocessor(tmp_path)
    elif granularity == "hourly":
        basename = "reanalysis-era5-single-levels"
        processor = ERA5HourlyPreprocessor(tmp_path)
    raw_path = Path(f"{basename}/2m_temperature/1979_2019/01_12.nc")
    generated = processor.create_filename(raw_path, "kenya")
    expected_name = "1979_2019_01_12_2m_temperature_kenya.nc"
    assert generated == expected_name, f"{generated} generated, expected {expected_name}"
def test_get_filenames(tmp_path, granularity):
    """get_filepaths should retrieve the one raw file for either granularity."""
    if granularity == "monthly":
        basename = "reanalysis-era5-single-levels-monthly-means"
        processor = ERA5MonthlyMeanPreprocessor(tmp_path)
    elif granularity == "hourly":
        basename = "reanalysis-era5-single-levels"
        processor = ERA5HourlyPreprocessor(tmp_path)
    (tmp_path / f"raw/{basename}/2m_temperature/1979_2019").mkdir(parents=True)
    # Same Path as the original's doubled-separator spelling: pathlib
    # collapses consecutive slashes.
    test_file = tmp_path / f"raw/{basename}/2m_temperature/1979_2019.01_12.nc"
    test_file.touch()
    retrieved = processor.get_filepaths()
    assert retrieved[0] == test_file, f"Expected {test_file} to be retrieved"
# Script section: subset hourly ERA5 v-wind to Africa, then join the subset
# files and resample to daily means. Paths are hard-coded to the soge-home
# cluster layout.
data_dir = Path('/soge-home/projects/crop_yield/ml_drought/data/')
base_out_dir = Path('/soge-home/projects/crop_yield/hackathon/')
final_out_dir = Path('/soge-home/projects/crop_yield/hackathon/v_wind')
# Source directory of raw hourly v-wind netCDFs.
v_wind_dir = Path(
    '/soge-home/data/analysis/era5/0.28125x0.28125/hourly/v_component_of_wind/nc'
)

# make directories
out_dir = base_out_dir / 'africa' / 'v_component_of_wind_hourly'
if not out_dir.exists():
    out_dir.mkdir(exist_ok=True, parents=True)
if not final_out_dir.exists():
    final_out_dir.mkdir(exist_ok=True, parents=True)

# Preprocessor is used here only for its chop_roi / resample_time helpers.
processor = ERA5MonthlyMeanPreprocessor(data_dir)

# # SUBSET AFRICA from hourly files
# # NOTE(review): this disabled loop references `file` in the last two lines,
# # which is undefined — it should presumably be `netcdf_filepath`; fix before
# # re-enabling.
# nc_files = [f for f in v_wind_dir.glob('*.nc')]
# for netcdf_filepath in nc_files:
#     ds = xr.open_dataset(netcdf_filepath).rename(
#         {'longitude': 'lon', 'latitude': 'lat'}
#     )
#     ds = processor.chop_roi(ds, subset_str='africa', inverse_lat=True)
#     ds.to_netcdf(out_dir / file.name)
#     print(f'Done for {file.name}')

# JOIN ALL FILES AND MAKE DAILY
# Lazily open every subset file (one chunk per timestep) and downsample to
# daily resolution; assumes the subset step above has already been run.
ds = xr.open_mfdataset(str(out_dir / '*.nc'), chunks={'time': 1})
ds = processor.resample_time(ds, resample_length='D', upsampling=False)