def test_timezone_adapter(self):
    """
    Verify that wrapping a reader in TimezoneAdapter keeps the timestamps
    of the unwrapped reader and yields an index without a timezone.

    Checked for the C3S reader (``read_ts`` and ``read``) and for the ISMN
    reader (``read_ts``).
    """

    def check_index(expected, adapted):
        # identical timestamps as delivered by the unwrapped reader
        self.assertTrue(
            np.array_equal(expected.index.values, adapted.index.values))
        # the adapted index must not carry timezone information
        self.assertTrue(
            not (hasattr(adapted.index, 'tz')
                 and adapted.index.tz is not None))

    lon, lat = -155.42, 19.78

    c3s_storage = Dataset.objects.get(short_name='C3S').storage_path
    c3s_data_folder = path.join(
        c3s_storage, 'C3S_V201706/TCDR/063_images_to_ts/combined-daily')
    c3s_reader = c3s_read(c3s_data_folder)
    timezone_reader = TimezoneAdapter(c3s_reader)

    plain = c3s_reader.read_ts(lon, lat)
    wrapped = timezone_reader.read_ts(lon, lat)
    check_index(plain, wrapped)

    plain = c3s_reader.read(lon, lat)
    wrapped = timezone_reader.read(lon, lat)
    check_index(plain, wrapped)

    ismn_storage = Dataset.objects.get(short_name='ISMN').storage_path
    ismn_data_folder = path.join(ismn_storage, 'ISMN_V20191211')
    ismn_reader = ISMN_Interface(ismn_data_folder)
    timezone_reader2 = TimezoneAdapter(ismn_reader)

    plain = ismn_reader.read_ts(0)
    wrapped = timezone_reader2.read_ts(0)
    check_index(plain, wrapped)
def generate_station_list():
    """
    This routine generates a list of available ISMN stations and the EASEv2
    grid point they are located in.

    The result is written to ``station_list.csv`` below ``paths.ismn``. Rows
    describing the same station (several sensors within the first 10 cm) are
    merged into one row whose index is the '-'-joined list of the sensor
    metadata indices.
    """

    paths = Paths()

    io = ISMN_Interface(paths.ismn_raw)

    # get metadata indices of all stations that measure soil moisture within the first 10 cm
    idx = io.get_dataset_ids('soil moisture', min_depth=0.0, max_depth=0.1)
    df = pd.DataFrame({'network': io.metadata[idx]['network'],
                       'station': io.metadata[idx]['station'],
                       'lat': io.metadata[idx]['latitude'],
                       'lon': io.metadata[idx]['longitude'],
                       'ease2_gpi': np.zeros(len(idx)).astype('int')},
                      index=idx)

    # merge indices for stations that have multiple sensors within the first 10 cm
    # sort=False keeps the groups in order of first appearance, which is the
    # row order drop_duplicates() leaves behind; with the default sort=True the
    # merged labels would be ordered by group key and could be attached to the
    # wrong rows.
    duplicate_idx = df.groupby(df.columns.tolist(), sort=False).apply(
        lambda x: '-'.join(['%i' % i for i in x.index])).values
    df.drop_duplicates(inplace=True)
    df.index = duplicate_idx

    # create EASEv2 grid domain
    grid = EASE2()
    lons, lats = np.meshgrid(grid.ease_lons, grid.ease_lats)
    lons = lons.flatten()
    lats = lats.flatten()

    # find EASEv2 grid points in which the individual stations are located
    for i, (label, data) in enumerate(df.iterrows()):
        print('%i / %i' % (i, len(df)))
        # squared distance (in degrees) between the station and every grid point
        r = (lons - data.lon) ** 2 + (lats - data.lat) ** 2
        # first grid point whose distance is within tolerance of the minimum
        df.loc[label, 'ease2_gpi'] = np.where((r - r.min()) < 0.0001)[0][0]

    df.to_csv(paths.ismn / 'station_list.csv')
def generate_station_list():
    """
    Generate a list of ISMN stations together with the EASEv2 grid point each
    station is located in and write it to ``station_list.csv`` below
    ``paths.ismn``.

    Rows describing the same station (several sensors within the first 10 cm)
    are merged into one row whose index is the '-'-joined list of the sensor
    metadata indices.
    """

    paths = Paths()

    io = ISMN_Interface(paths.ismn / 'downloaded' / 'CONUS_20100101_20190101')

    # get metadata indices of all stations that measure soil moisture within the first 10 cm
    idx = io.get_dataset_ids('soil moisture', min_depth=0.0, max_depth=0.1)
    df = pd.DataFrame(
        {
            'network': io.metadata[idx]['network'],
            'station': io.metadata[idx]['station'],
            'lat': io.metadata[idx]['latitude'],
            'lon': io.metadata[idx]['longitude'],
            'ease2_gpi': np.zeros(len(idx)).astype('int')
        },
        index=idx)

    # merge indices for stations that have multiple sensors within the first 10 cm
    # sort=False keeps the groups in order of first appearance, which is the
    # row order drop_duplicates() leaves behind; with the default sort=True the
    # merged labels would be ordered by group key and could be attached to the
    # wrong rows.
    duplicate_idx = df.groupby(df.columns.tolist(), sort=False).apply(
        lambda x: '-'.join(['%i' % i for i in x.index])).values
    df.drop_duplicates(inplace=True)
    df.index = duplicate_idx

    # flattened lon/lat coordinates of every EASEv2 grid point
    grid = EASE2()
    lons, lats = np.meshgrid(grid.ease_lons, grid.ease_lats)
    lons = lons.flatten()
    lats = lats.flatten()

    for i, (label, data) in enumerate(df.iterrows()):
        print('%i / %i' % (i, len(df)))
        # squared distance (in degrees) between the station and every grid point
        r = (lons - data.lon)**2 + (lats - data.lat)**2
        # first grid point whose distance is within tolerance of the minimum
        df.loc[label, 'ease2_gpi'] = np.where((r - r.min()) < 0.0001)[0][0]

    df.to_csv(paths.ismn / 'station_list.csv')
def resample_timeseries():
    """
    Write one soil moisture time series per EASEv2 grid cell to
    ``<paths.ismn>/timeseries/<gpi>.csv``.

    The stations / sensors belonging to each grid cell are taken from
    ``station_list.csv``. Only measurements flagged 'G' are used. If a cell
    contains several sensors, each series is rescaled to the mean and
    standard deviation of the sensor with the most valid observations and
    the series are then averaged.
    """

    paths = Paths()

    io = ISMN_Interface(paths.ismn / 'downloaded' / 'CONUS_20100101_20190101')

    # get all stations / sensors for each grid cell.
    lut = pd.read_csv(paths.ismn / 'station_list.csv', index_col=0)
    # str() so that the join also works when the CSV index was parsed as integers
    lut = lut.groupby('ease2_gpi').apply(
        lambda x: '-'.join(str(i) for i in x.index))

    dir_out = paths.ismn / 'timeseries'
    # without the directory every write below would fail and be misreported
    # as a corrupt input file
    dir_out.mkdir(parents=True, exist_ok=True)

    # Series.items() replaces iteritems(), which was removed in pandas 2.0
    for cnt, (gpi, indices) in enumerate(lut.items()):
        print('%i / %i' % (cnt, len(lut)))

        fname = dir_out / ('%i.csv' % gpi)

        idx = indices.split('-')

        # Only one station within grid cell
        if len(idx) == 1:
            try:
                ts = io.read_ts(int(idx[0]))
                ts = ts[ts['soil moisture_flag'] == 'G']['soil moisture']
                ts.tz_convert(None).to_csv(fname, float_format='%.4f')
            except Exception:
                # best effort: report and continue, but let
                # KeyboardInterrupt / SystemExit propagate
                print('Corrupt file: ' + io.metadata[int(idx[0])]['filename'])

        # Multiple stations within grid cell
        else:
            df = []
            for i in idx:
                try:
                    ts = io.read_ts(int(i))
                    df += [
                        ts[ts['soil moisture_flag'] == 'G']['soil moisture']
                    ]
                except Exception:
                    print('Corrupt file: ' + io.metadata[int(i)]['filename'])
            if len(df) == 0:
                continue

            df = pd.concat(df, axis=1)
            df.columns = np.arange(len(df.columns))

            # match temporal mean and standard deviation to those of the station with the maximum temporal coverage
            n = np.array([len(df[i].dropna()) for i in df])
            ref = np.where(n == n.max())[0][0]
            for col in df:
                if col != ref:
                    df[col] = (df[col] - df[col].mean()) / df[col].std(
                    ) * df[ref].std() + df[ref].mean()

            # Average measurements of all stations
            df.mean(axis='columns').tz_convert(None).to_csv(
                fname, float_format='%.4f')
def main(_) -> None:
    """Label all soil moisture input files in parallel and print summed statistics.

    The positional argument is ignored. Every input file is processed by
    ``add_sm_labels_to_data`` in a worker pool; the per-file result tuples
    are summed position-wise and the first three totals are printed.
    """
    # Make a call to the ISMN_Interface to create metadata required once.
    ISMN_Interface(_ISMN_DATA_PATH_FLAG.value, parallel=True)

    # Soil moisture data released with the paper is stored under
    # soil_moisture_retrieval_data/* under the eoscience-public GCS bucket.
    input_files = []
    for idx in range(_TOTAL_FILES):
        input_files.append(
            f"soil_moisture_retrieval_data/data-{idx:05d}-of-{_TOTAL_FILES:05d}.tfrecord.gz"
        )

    pool = multiprocessing.pool.Pool(
        _NUM_WORKERS_FLAG.value,
        initializer=init_worker,
        initargs=(add_sm_labels_to_data, ),
    )
    with pool:
        # one (input file, output dir, temp dir) argument tuple per file
        work_items = zip(
            input_files,
            itertools.repeat(_OUTPUT_DIR_FLAG.value),
            itertools.repeat(_TEMP_STORAGE_DIR_FLAG.value),
        )
        stats_list = pool.starmap(add_sm_labels_to_data, work_items)
        pool.close()
        pool.join()

    print("Finished execution!")

    # sum the statistics position-wise over all files
    total_stats = [sum(column) for column in zip(*stats_list)]
    print(
        f"Total success: {total_stats[0]}, Total lookup failures: {total_stats[1]}, Total value failures: {total_stats[2]}"
    )
def setUpClass(cls):
    """
    Remove any existing metadata of the unzipped CEOP test data set, build
    the interface once with an empty network selection and remember the
    data path on the class.
    """
    super(Test_ISMN_Interface_CeopUnzipped, cls).setUpClass()

    data_dir = os.path.join(testdata_root,
                            "Data_seperate_files_20170810_20180809")
    cleanup(os.path.join(data_dir, "python_metadata"))

    # an empty network list must result in no loaded networks
    interface = ISMN_Interface(data_dir, network=[], parallel=True)
    assert interface.networks == OrderedDict()

    cls.testdata = data_dir
def setUpClass(cls):
    """
    Remove any existing metadata next to the zipped header-values test
    archive, build the interface once and remember the archive path on
    the class.
    """
    super(Test_ISMN_Interface_HeaderValuesZipped, cls).setUpClass()

    archive_dir = os.path.join(testdata_root, "zip_archives", "header")
    archive = os.path.join(
        archive_dir, "Data_seperate_files_header_20170810_20180809.zip")

    # clean up existing metadata
    cleanup(os.path.join(archive_dir, "python_metadata"))

    ISMN_Interface(archive)

    cls.testdata_zip_path = archive
def setUpClass(cls):
    """
    Remove any existing metadata next to the zipped CEOP test archive,
    build the interface once and remember the archive path on the class.
    """
    super(Test_ISMN_Interface_CeopZipped, cls).setUpClass()

    archive_dir = os.path.join(testdata_root, 'zip_archives', 'ceop')
    archive = os.path.join(archive_dir,
                           'Data_seperate_files_20170810_20180809.zip')

    # clean up existing metadata
    cleanup(os.path.join(archive_dir, 'python_metadata'))

    ISMN_Interface(archive)

    cls.testdata_zip_path = archive
def create_reader(dataset, version):
    """
    Create the time series reader for a dataset version, wrapped in a
    TimezoneAdapter.

    The data folder is ``<dataset.storage_path>/<version.short_name>``; the
    reader class is selected by ``dataset.short_name``.

    Raises
    ------
    ValueError
        If no reader is available for the dataset.
    """
    folder_name = path.join(dataset.storage_path, version.short_name)
    name = dataset.short_name
    reader = None

    if name == globals.ISMN:
        reader = ISMN_Interface(folder_name)

    if name == globals.C3S:
        reader = c3s_read(
            path.join(folder_name, 'TCDR/063_images_to_ts/combined-daily'),
            ioclass_kws={'read_bulk': True})

    if name in (globals.CCI, globals.CCIA, globals.CCIP):
        reader = CCITs(folder_name, ioclass_kws={'read_bulk': True})

    if name == globals.GLDAS:
        reader = GLDASTs(folder_name, ioclass_kws={'read_bulk': True})

    if name == globals.SMAP:
        reader = SMAPTs(path.join(folder_name, 'netcdf'),
                        ioclass_kws={'read_bulk': True})

    if name == globals.ASCAT:
        data_dir = path.join(folder_name, 'data')
        grid_file = first_file_in(path.join(folder_name, 'grid'), '.nc')
        reader = AscatNc(path=data_dir,
                         fn_format="{:04d}",
                         grid_filename=grid_file,
                         static_layer_path=None,
                         ioclass_kws={'read_bulk': True})

    if name == globals.SMOS:
        reader = SMOSTs(folder_name, ioclass_kws={'read_bulk': True})

    # ERA5 and ERA5-Land are read with the same reader class
    if name == globals.ERA5:
        reader = ERATs(folder_name, ioclass_kws={'read_bulk': True})

    if name == globals.ERA5_LAND:
        reader = ERATs(folder_name, ioclass_kws={'read_bulk': True})

    if not reader:
        raise ValueError(
            "Reader for dataset '{}' not available".format(dataset))

    return TimezoneAdapter(reader)
def setUpClass(cls):
    """
    Remove any existing metadata of the unzipped header-values test data
    set, build the interface once and remember the data path on the class.
    """
    super(Test_ISMN_Interface_HeaderValuesUnzipped, cls).setUpClass()

    data_dir = os.path.join(
        testdata_root, "Data_seperate_files_header_20170810_20180809")

    # clean existing metadata
    cleanup(os.path.join(data_dir, "python_metadata"))

    ISMN_Interface(data_dir)

    cls.testdata = data_dir
def ismn_reader():
    """Return an ISMN_Interface for the multinetwork header-values test data."""
    # test data live in ../test-data relative to this file
    here = os.path.dirname(__file__)
    parts = ("..", "test-data", "ismn", "multinetwork", "header_values")
    return ISMN_Interface(os.path.join(here, *parts))
def __init__(self, path=None, col_offs=0, row_offs=0):
    """
    Open the ISMN data set and load, or trigger generation of, its
    station list.

    Parameters
    ----------
    path : str or Path, optional
        Root directory of the ISMN data. Defaults to
        ``~/data_sets/ISMN/CONUS_20070101_20200101``.
    col_offs : int, optional
        Stored as ``self.col_offs``.
    row_offs : int, optional
        Stored as ``self.row_offs``.
    """
    self.col_offs, self.row_offs = col_offs, row_offs

    if path is not None:
        self.root = Path(path)
    else:
        default_root = Path('~/data_sets/ISMN/CONUS_20070101_20200101')
        self.root = default_root.expanduser()

    self.io = ISMN_Interface(self.root)

    self.list_file = self.root / 'station_list.csv'
    if self.list_file.exists():
        self.list = pd.read_csv(self.list_file, index_col=0)
    else:
        print('Station list does not exist.')
        # NOTE(review): self.list is not assigned here; presumably
        # generate_station_list() sets it - confirm.
        self.generate_station_list()
def test_metadata_dataframe():
    """
    Make sure that ``metadata.index`` represents the same values as
    ``get_dataset_ids`` and that metadata entries can be looked up by id.
    """
    testdata = os.path.join(testdata_root,
                            "Data_seperate_files_20170810_20180809")
    metadata_path = os.path.join(testdata, "python_metadata")
    cleanup(metadata_path)

    ds_one = ISMN_Interface(testdata, meta_path=metadata_path,
                            network='FR_Aqui')
    try:
        assert np.all(ds_one.metadata.index.values == ds_one.get_dataset_ids(
            None, -np.inf, np.inf))

        ids = ds_one.get_dataset_ids('soil_moisture')
        # list == ndarray yields an array; asserting on it raises ValueError
        # as soon as more than one id is returned, so compare explicitly
        assert np.array_equal(ids, ds_one.metadata.index.values)

        assert ds_one.metadata.loc[ids[0], 'variable']['val'] == 'soil_moisture'
        assert ds_one.metadata.loc[ids[0], 'network']['val'] == 'FR_Aqui'
    finally:
        # release file handles even when an assertion above fails
        ds_one.close_files()
def setUp(self) -> None:
    """Open an interface restricted to the COSMOS network before each test."""
    selected_networks = ["COSMOS"]
    self.ds = ISMN_Interface(self.testdata, network=selected_networks)
def test_ascat_ismn_validation():
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    test_data = os.path.join(os.path.dirname(__file__), '..', 'test-data')

    ascat_root = os.path.join(test_data, 'sat', 'ascat', 'netcdf')
    ascat_reader = AscatSsmCdr(
        os.path.join(ascat_root, '55R22'),
        os.path.join(ascat_root, 'grid'),
        grid_filename='TUW_WARP5_grid_info_2_1.nc',
        static_layer_path=os.path.join(test_data, 'sat', 'h_saf',
                                       'static_layer'))
    ascat_reader.read_bulk = True

    # Initialize ISMN reader
    ismn_reader = ISMN_Interface(
        os.path.join(test_data, 'ismn', 'multinetwork', 'header_values'))

    ids = ismn_reader.get_dataset_ids(
        variable='soil moisture', min_depth=0, max_depth=0.1)
    # one job per time series: (id, longitude, latitude)
    id_meta_pairs = ((i, ismn_reader.metadata[i]) for i in ids)
    jobs = [(i, meta['longitude'], meta['latitude'])
            for i, meta in id_meta_pairs]

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!
    save_path = tempfile.mkdtemp()

    # Create the validation object.
    ascat_config = {
        'class': ascat_reader,
        'columns': ['sm'],
        'kwargs': {'mask_frozen_prob': 80,
                   'mask_snow_prob': 80,
                   'mask_ssf': True},
    }
    ismn_config = {'class': ismn_reader, 'columns': ['soil moisture']}
    datasets = {'ISMN': ismn_config, 'ASCAT': ascat_config}

    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]
    basic_metrics = metrics_calculators.BasicMetrics(other_name='k1')

    process = Validation(
        datasets,
        'ISMN',
        temporal_ref='ASCAT',
        scaling='lin_cdf_match',
        scaling_ref='ASCAT',
        metrics_calculators={(2, 2): basic_metrics.calc_metrics},
        period=period)

    for job in jobs:
        netcdf_results_manager(process.calc(*job), save_path)

    results_fname = os.path.join(
        save_path, 'ASCAT.sm_with_ISMN.soil moisture.nc')

    vars_should = [u'n_obs', u'tau', u'gpi', u'RMSD', u'lon', u'p_tau',
                   u'BIAS', u'p_rho', u'rho', u'lat', u'R', u'p_R']
    n_obs_should = [384, 357, 482, 141, 251, 1927, 1887, 1652]
    rho_should = np.array(
        [0.70022893, 0.53934574, 0.69356072, 0.84189808,
         0.74206454, 0.30299741, 0.53143877, 0.62204134],
        dtype=np.float32)
    rmsd_should = np.array(
        [7.72966719, 11.58347607, 14.57700157, 13.06224251,
         12.90389824, 14.24668026, 21.19682884, 17.3883934],
        dtype=np.float32)

    # results are compared order-independently, hence the sorting
    with nc.Dataset(results_fname, mode='r') as ds:
        stored = ds.variables
        assert sorted(stored.keys()) == sorted(vars_should)
        assert sorted(stored['n_obs'][:].tolist()) == sorted(n_obs_should)
        nptest.assert_allclose(sorted(rho_should),
                               sorted(stored['rho'][:]),
                               rtol=1e-4)
        nptest.assert_allclose(sorted(rmsd_should),
                               sorted(stored['RMSD'][:]),
                               rtol=1e-4)
# Create the ASCAT reader from the data / grid folders defined earlier in the
# script and switch on its read_bulk attribute.
ascat_reader = AscatSsmCdr(ascat_data_folder, ascat_grid_folder,
                           grid_filename='TUW_WARP5_grid_info_2_1.nc',
                           static_layer_path=static_layers_folder)
ascat_reader.read_bulk = True


# Initialize ISMN reader

# In[4]:

ismn_data_folder = os.path.join(testdata_folder, 'ismn/multinetwork/header_values')
ismn_reader = ISMN_Interface(ismn_data_folder)


# The validation is run based on jobs. A job consists of at least three lists or numpy arrays specifying the grid
# point index, its latitude and longitude. In the case of the ISMN we can use the `dataset_ids` that identify every
# time series in the downloaded ISMN data as our grid point index. We can then get longitude and latitude from the
# metadata of the dataset.
#
# **DO NOT CHANGE** the name ***jobs*** because it will be searched during the parallel processing!

# In[5]:

jobs = []

# ids of all soil moisture time series with sensor depths between 0 and 0.1
ids = ismn_reader.get_dataset_ids(variable='soil moisture', min_depth=0, max_depth=0.1)
def resample_ismn():
    """
    This resamples ISMN data onto the EASE2 grid and stores data for each grid cell into .csv files.
    If single grid cells contain multiple stations, they are averaged.

    A grid look-up table needs to be created first (method: ancillary.grid.create_lut).

    Output files are written to ``<paths.ismn>/timeseries/<gpi>.csv``; the
    directory is created if it does not exist. Sensors whose data cannot be
    read are reported on stdout and skipped.
    """

    paths = Paths()

    io = ISMN_Interface(paths.ismn_raw)

    # get all stations / sensors for each grid cell.
    lut = pd.read_csv(paths.ismn / 'station_list.csv', index_col=0)
    # str() so that the join also works when the CSV index was parsed as integers
    lut = lut.groupby('ease2_gpi').apply(
        lambda x: '-'.join(str(i) for i in x.index))

    dir_out = paths.ismn / 'timeseries'
    # also creates missing parent directories and tolerates an existing one
    dir_out.mkdir(parents=True, exist_ok=True)

    # Series.items() replaces iteritems(), which was removed in pandas 2.0
    for cnt, (gpi, indices) in enumerate(lut.items()):
        print('%i / %i' % (cnt, len(lut)))

        fname = dir_out / ('%i.csv' % gpi)

        idx = indices.split('-')

        # Only one station within grid cell
        if len(idx) == 1:
            try:
                ts = io.read_ts(int(idx[0]))
                ts = ts[ts['soil moisture_flag'] == 'G']['soil moisture']  # Get only "good" data based on ISMN QC
                ts.tz_convert(None).to_csv(fname, float_format='%.4f')
            except Exception:
                # best effort: report and continue, but let
                # KeyboardInterrupt / SystemExit propagate
                print('Corrupt file: ' + io.metadata[int(idx[0])]['filename'])

        # Multiple stations within grid cell
        else:
            df = []
            for i in idx:
                try:
                    ts = io.read_ts(int(i))
                    df += [ts[ts['soil moisture_flag'] == 'G']['soil moisture']]  # Get only "good" data based on ISMN QC
                except Exception:
                    print('Corrupt file: ' + io.metadata[int(i)]['filename'])
            if len(df) == 0:
                continue

            df = pd.concat(df, axis=1)
            df.columns = np.arange(len(df.columns))

            # match temporal mean and standard deviation to those of the station with the maximum temporal coverage
            n = np.array([len(df[i].dropna()) for i in df])
            ref = np.where(n == n.max())[0][0]
            for col in df:
                if col != ref:
                    df[col] = (df[col] - df[col].mean()) / df[col].std() * df[ref].std() + df[ref].mean()

            # Average measurements of all stations
            df.mean(axis='columns').tz_convert(None).to_csv(fname, float_format='%.4f')
def test_ascat_ismn_validation_metadata_rolling(ascat_reader):
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    # Initialize ISMN reader
    ismn_reader = ISMN_Interface(
        os.path.join(os.path.dirname(__file__), "..", "test-data", "ismn",
                     "multinetwork", "header_values"))

    ids = ismn_reader.get_dataset_ids(
        variable="soil moisture", min_depth=0, max_depth=0.1)

    metadata_dict_template = {
        "network": np.array(["None"], dtype="U256"),
        "station": np.array(["None"], dtype="U256"),
        "landcover": np.float32([np.nan]),
        "climate": np.array(["None"], dtype="U4"),
    }

    def build_job(idx):
        # job tuple: (id, longitude, latitude, [metadata of the series])
        meta = ismn_reader.metadata[idx]
        job_metadata = [{
            "network": meta["network"],
            "station": meta["station"],
            "landcover": meta["landcover_2010"],
            "climate": meta["climate"],
        }]
        return (idx, meta["longitude"], meta["latitude"], job_metadata)

    jobs = [build_job(idx) for idx in ids]

    save_path = tempfile.mkdtemp()

    # Create the validation object.
    reader_config = {
        "ISMN": {"class": ismn_reader, "columns": ["soil moisture"]},
        "ASCAT": {
            "class": ascat_reader,
            "columns": ["sm"],
            "kwargs": {
                "mask_frozen_prob": 80,
                "mask_snow_prob": 80,
                "mask_ssf": True,
            },
        },
    }
    read_ts_names = {"ASCAT": "read", "ISMN": "read_ts"}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(
        reader_config, "ISMN", period, read_ts_names=read_ts_names)

    rolling_metrics = metrics_calculators.RollingMetrics(
        other_name="k1", metadata_template=metadata_dict_template)
    process = Validation(
        datasets,
        "ISMN",
        temporal_ref="ASCAT",
        scaling="lin_cdf_match",
        scaling_ref="ASCAT",
        metrics_calculators={(2, 2): rolling_metrics.calc_metrics},
        period=period,
    )

    for job in jobs:
        netcdf_results_manager(
            process.calc(*job), save_path, ts_vars=["R", "p_R", "RMSD"])

    results_fname = os.path.join(
        save_path, "ASCAT.sm_with_ISMN.soil moisture.nc")

    # not asserted below; kept as in the original test
    vars_should = [
        u"gpi", u"lon", u"lat", u"R", u"p_R", u"time", u"idx", u"_row_size",
    ]
    vars_should.extend(metadata_dict_template.keys())

    network_should = np.array(
        ["MAQU", "MAQU", "SCAN", "SCAN", "SCAN",
         "SOILSCAPE", "SOILSCAPE", "SOILSCAPE"],
        dtype="U256",
    )

    reader = PointDataResults(results_fname, read_only=True)
    df = reader.read_loc(None)
    nptest.assert_equal(sorted(network_should), sorted(df["network"].values))
    assert np.all(df.gpi.values == np.arange(8))
    assert reader.read_ts(0).index.size == 357
    expected_columns = np.array(["R", "p_R", "RMSD"])
    assert np.all(reader.read_ts(1).columns.values == expected_columns)
def test_ascat_ismn_validation_metadata(ascat_reader):
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    # Initialize ISMN reader
    ismn_reader = ISMN_Interface(
        os.path.join(os.path.dirname(__file__), "..", "test-data", "ismn",
                     "multinetwork", "header_values"))

    ids = ismn_reader.get_dataset_ids(
        variable="soil moisture", min_depth=0, max_depth=0.1)

    metadata_dict_template = {
        "network": np.array(["None"], dtype="U256"),
        "station": np.array(["None"], dtype="U256"),
        "landcover": np.float32([np.nan]),
        "climate": np.array(["None"], dtype="U4"),
    }

    def build_job(idx):
        # job tuple: (id, longitude, latitude, [metadata of the series])
        meta = ismn_reader.metadata[idx]
        job_metadata = [{
            "network": meta["network"],
            "station": meta["station"],
            "landcover": meta["landcover_2010"],
            "climate": meta["climate"],
        }]
        return (idx, meta["longitude"], meta["latitude"], job_metadata)

    jobs = [build_job(idx) for idx in ids]

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!
    save_path = tempfile.mkdtemp()

    # Create the validation object.
    reader_config = {
        "ISMN": {
            "class": ismn_reader,
            "columns": ["soil moisture"],
        },
        "ASCAT": {
            "class": ascat_reader,
            "columns": ["sm"],
            "kwargs": {
                "mask_frozen_prob": 80,
                "mask_snow_prob": 80,
                "mask_ssf": True,
            },
        },
    }
    read_ts_names = {"ASCAT": "read", "ISMN": "read_ts"}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(
        reader_config, "ISMN", period, read_ts_names=read_ts_names)

    basic_metrics = metrics_calculators.BasicMetrics(
        other_name="k1", metadata_template=metadata_dict_template)
    process = Validation(
        datasets,
        "ISMN",
        temporal_ref="ASCAT",
        scaling="lin_cdf_match",
        scaling_ref="ASCAT",
        metrics_calculators={(2, 2): basic_metrics.calc_metrics},
        period=period,
    )

    for job in jobs:
        netcdf_results_manager(process.calc(*job), save_path)

    results_fname = os.path.join(
        save_path, "ASCAT.sm_with_ISMN.soil moisture.nc")

    vars_should = [
        u"n_obs", u"tau", u"gpi", u"RMSD", u"lon", u"p_tau", u"BIAS",
        u"p_rho", u"rho", u"lat", u"R", u"p_R", u"time", u"idx",
        u"_row_size",
    ]
    vars_should.extend(metadata_dict_template.keys())

    n_obs_should = [357, 384, 1646, 1875, 1915, 467, 141, 251]
    rho_should = np.array(
        [0.53934574, 0.7002289, 0.62200236, 0.53647155,
         0.30413666, 0.6740655, 0.8418981, 0.74206454],
        dtype=np.float32,
    )
    rmsd_should = np.array(
        [11.583476, 7.729667, 17.441547, 21.125721,
         14.31557, 14.187225, 13.0622425, 12.903898],
        dtype=np.float32,
    )
    network_should = np.array(
        ["MAQU", "MAQU", "SCAN", "SCAN", "SCAN",
         "SOILSCAPE", "SOILSCAPE", "SOILSCAPE"],
        dtype="U256",
    )

    # pull everything needed out of the file, then compare order-independently
    with nc.Dataset(results_fname, mode="r") as results:
        stored = results.variables
        var_names = list(stored.keys())
        n_obs = stored["n_obs"][:].tolist()
        rho = stored["rho"][:]
        rmsd = stored["RMSD"][:]
        network = stored["network"][:]

    assert sorted(var_names) == sorted(vars_should)
    assert sorted(n_obs) == sorted(n_obs_should)
    nptest.assert_allclose(sorted(rho), sorted(rho_should), rtol=1e-4)
    nptest.assert_allclose(sorted(rmsd), sorted(rmsd_should), rtol=1e-4)
    nptest.assert_equal(sorted(network), sorted(network_should))
class Test_ISMN_Interface_CeopUnzipped(unittest.TestCase):
    """
    Tests of ISMN_Interface on the unzipped test data set
    'Data_seperate_files_20170810_20180809', restricted to the COSMOS network.
    """

    @classmethod
    def setUpClass(cls):
        """Remove existing metadata, build the interface once and store the data path."""
        super(Test_ISMN_Interface_CeopUnzipped, cls).setUpClass()

        testdata = os.path.join(testdata_root,
                                'Data_seperate_files_20170810_20180809')
        metadata_path = os.path.join(testdata, 'python_metadata')

        cleanup(metadata_path)
        # an empty network selection must give an empty network collection
        ds = ISMN_Interface(testdata, network=[])
        assert ds.networks == OrderedDict()
        cls.testdata = testdata

    def setUp(self) -> None:
        """Open an interface for the COSMOS network before each test."""
        self.ds = ISMN_Interface(self.testdata, network=['COSMOS'])

    def tearDown(self) -> None:
        """Close the data files and shut down logging after each test."""
        self.ds.close_files()
        logging.shutdown()

    def test_list(self):
        """Check the number of listed networks, stations and sensors."""
        assert len(self.ds.list_networks()) == 1
        assert len(self.ds.list_stations()) == len(self.ds.list_stations('COSMOS')) == 2
        assert len(self.ds.list_sensors()) == 2
        assert len(self.ds.list_sensors(station='Barrow-ARM')) == 1

    def test_network_for_station(self):
        """Both stations must be assigned to the COSMOS network."""
        assert self.ds.network_for_station('Barrow-ARM') == 'COSMOS'
        assert self.ds.network_for_station('ARM-1') == 'COSMOS'

    def test_stations_that_measure(self):
        """Stations are found for 'soil_moisture' and none for an unknown variable."""
        for s in self.ds.stations_that_measure('soil_moisture'):
            assert s.name in ['ARM-1', 'Barrow-ARM']

        # the iterator must be empty, otherwise the loop body fails the test
        for s in self.ds.stations_that_measure('nonexisting'):
            raise AssertionError("Found var that doesnt exist")

    def test_get_dataset_ids(self):
        """Check id selection by variable, depth range, grouping and metadata filter."""
        ids = self.ds.get_dataset_ids('soil_moisture', max_depth=100, groupby='network')
        assert list(ids.keys()) == ['COSMOS']
        assert ids['COSMOS'] == [0,1]

        ids = self.ds.get_dataset_ids('soil_moisture', max_depth=0.19)
        assert ids == [0]

        ids = self.ds.get_dataset_ids('soil_moisture', max_depth=99,
                                      filter_meta_dict={'lc_2010': 210,
                                                        'network': 'COSMOS',
                                                        'station': 'Barrow-ARM'})
        assert ids == [1]

        ids = self.ds.get_dataset_ids('novar')
        assert len(ids) == 0

        ids = self.ds.get_dataset_ids('soil_moisture', 0., 0.19)  # should get 1
        assert len(ids) == 1

        ids = self.ds.get_dataset_ids('soil_moisture', 0., 1.)  # should get 2
        assert len(ids) == 2

        ids = self.ds.get_dataset_ids('soil_moisture', 0., 1.,
                                      filter_meta_dict={'lc_2010': 210})  # should get 1
        assert len(ids) == 1

        ids = self.ds.get_dataset_ids('nonexisting')  # should get 0
        assert len(ids) == 0

    def test_read_ts(self):
        """Both `read` and `read_ts` return non-empty data for ids 0 and 1."""
        data1 = self.ds.read(0)
        assert not data1.empty

        data2 = self.ds.read_ts(1)
        assert not data2.empty

        assert len(data1.index) != len(data2.index)  # make sure they are not same

    def test_find_nearest_station(self):
        """The station found at given station coordinates has exactly these coordinates."""
        should_lon, should_lat = -156.62870, 71.32980

        station = self.ds.find_nearest_station(should_lon, should_lat)

        assert station.lon == should_lon
        assert station.lat == should_lat

    def test_plot_station_locations(self):
        """Plotting the station locations writes exactly one file."""
        with TemporaryDirectory() as out_dir:
            outpath = os.path.join(out_dir, 'plot.png')
            self.ds.plot_station_locations('soil_moisture', markersize=5, filename=outpath)
            assert len(os.listdir(out_dir)) == 1

    def test_get_min_max_obs_timestamps(self):
        """First and last observation time for soil moisture down to depth 0.19."""
        tmin, tmax = self.ds.get_min_max_obs_timestamps('soil_moisture', max_depth=0.19)
        assert tmin == datetime(2017, 8, 10, 0)
        assert tmax == datetime(2018, 8, 9, 23)

    def test_get_min_max_obs_timestamps_for_station(self):
        """Same time range check, queried on the station object of 'ARM-1'."""
        station = self.ds.collection.networks['COSMOS'].stations['ARM-1']
        tmin, tmax = station.get_min_max_obs_timestamp('soil_moisture', 0, 0.19)
        assert tmin == datetime(2017, 8, 10, 0)
        assert tmax == datetime(2018, 8, 9, 23)

    def test_get_static_var_val(self):
        """Check the land cover and climate classes reported for the sensors."""
        vals = self.ds.get_static_var_vals('soil_moisture', max_depth=0.19)
        assert vals == {130: 'Grassland'}

        vals = self.ds.get_landcover_types('soil_moisture', max_depth=100)
        assert len(vals) == 2
        assert vals[130] == 'Grassland'
        assert vals[210] == 'Water'
        self.ds.print_landcover_dict()

        vals = self.ds.get_climate_types('soil_moisture', max_depth=100,
                                         climate='climate_KG')
        assert len(vals) == 2
        assert vals['ET'] == 'Polar Tundra'
        assert vals['Cfa'] == 'Temperate Without Dry Season, Hot Summer'
        self.ds.print_climate_dict()

    def test_get_var(self):
        """Only 'soil_moisture' is available as variable."""
        vars = self.ds.get_variables()
        assert vars == ['soil_moisture']

    def test_get_sensors(self):
        """Iterate sensors via stations, via a single station and via the collection."""
        i = 0
        for nw, station in self.ds.collection.iter_stations(
                filter_meta_dict={'network': 'COSMOS'}):
            for se in station.iter_sensors():
                data = se.read_data()
                # check if the networks is COSMOS or station in [ARM, Barrow-ARM]
                assert not data.empty
                # check something for that one station
                i += 1
        assert i == 2

        i = 0
        for se in self.ds.networks['COSMOS'].stations['Barrow-ARM'].iter_sensors():
            data = se.read_data()
            assert not data.empty
            # check something for that one station
            i += 1
        assert i == 1

        i = 0
        for net, stat, sens in self.ds.collection.iter_sensors(
                depth=Depth(0,1),
                filter_meta_dict={'station': ['Barrow-ARM', 'ARM-1']}):
            data = sens.read_data()
            assert not data.empty
            i +=1
        assert i == 2

        # no sensor may be yielded for an unknown variable
        for nw, station in self.ds.collection.iter_stations():
            for se in station.iter_sensors(variable='nonexisting'):
                raise ValueError("Found sensor, although none should exist")

    def test_get_nearest_station(self):
        """Nearest station lookup on the collection and with a distance limit."""
        should_lon, should_lat = -156.62870, 71.32980

        station, dist = self.ds.collection.get_nearest_station(should_lon, should_lat)
        assert dist == 0
        assert station.lon == should_lon
        assert station.lat == should_lat
        # truncated coordinates no longer coincide with a grid point
        gpi, dist = self.ds.collection.grid.find_nearest_gpi(int(should_lon),int(should_lat))
        assert dist != 0
        for net in self.ds.collection.iter_networks():
            if station.name in net.stations.keys():
                assert net.stations[station.name].lon == should_lon
                assert net.stations[station.name].lat == should_lat

        # the test expects None for both values when searching around (0, 0)
        # with max_dist=100
        station, dist = self.ds.find_nearest_station(0, 0, return_distance=True, max_dist=100)
        assert station == dist == None
def test_ascat_ismn_validation_metadata_rolling():
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    test_data = os.path.join(os.path.dirname(__file__), '..', 'test-data')

    ascat_root = os.path.join(test_data, 'sat', 'ascat', 'netcdf')
    ascat_reader = AscatSsmCdr(
        os.path.join(ascat_root, '55R22'),
        os.path.join(ascat_root, 'grid'),
        grid_filename='TUW_WARP5_grid_info_2_1.nc',
        static_layer_path=os.path.join(test_data, 'sat', 'h_saf',
                                       'static_layer'))
    ascat_reader.read_bulk = True

    # Initialize ISMN reader
    ismn_reader = ISMN_Interface(
        os.path.join(test_data, 'ismn', 'multinetwork', 'header_values'))

    ids = ismn_reader.get_dataset_ids(variable='soil moisture',
                                      min_depth=0,
                                      max_depth=0.1)

    metadata_dict_template = {
        'network': np.array(['None'], dtype='U256'),
        'station': np.array(['None'], dtype='U256'),
        'landcover': np.float32([np.nan]),
        'climate': np.array(['None'], dtype='U4')
    }

    def build_job(idx):
        # job tuple: (id, longitude, latitude, [metadata of the series])
        meta = ismn_reader.metadata[idx]
        job_metadata = [{
            'network': meta['network'],
            'station': meta['station'],
            'landcover': meta['landcover_2010'],
            'climate': meta['climate']
        }]
        return (idx, meta['longitude'], meta['latitude'], job_metadata)

    jobs = [build_job(idx) for idx in ids]

    save_path = tempfile.mkdtemp()

    # Create the validation object.
    reader_config = {
        'ISMN': {
            'class': ismn_reader,
            'columns': ['soil moisture']
        },
        'ASCAT': {
            'class': ascat_reader,
            'columns': ['sm'],
            'kwargs': {
                'mask_frozen_prob': 80,
                'mask_snow_prob': 80,
                'mask_ssf': True
            }
        }
    }
    read_ts_names = {'ASCAT': 'read', 'ISMN': 'read_ts'}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(reader_config,
                           'ISMN',
                           period,
                           read_ts_names=read_ts_names)

    rolling_metrics = metrics_calculators.RollingMetrics(
        other_name='k1', metadata_template=metadata_dict_template)
    process = Validation(
        datasets,
        'ISMN',
        temporal_ref='ASCAT',
        scaling='lin_cdf_match',
        scaling_ref='ASCAT',
        metrics_calculators={(2, 2): rolling_metrics.calc_metrics},
        period=period)

    for job in jobs:
        netcdf_results_manager(process.calc(*job),
                               save_path,
                               ts_vars=['R', 'p_R', 'RMSD'])

    results_fname = os.path.join(save_path,
                                 'ASCAT.sm_with_ISMN.soil moisture.nc')

    # not asserted below; kept as in the original test
    vars_should = [
        u'gpi', u'lon', u'lat', u'R', u'p_R', u'time', u'idx', u'_row_size'
    ]
    vars_should.extend(metadata_dict_template.keys())

    network_should = np.array([
        'MAQU', 'MAQU', 'SCAN', 'SCAN', 'SCAN', 'SOILSCAPE', 'SOILSCAPE',
        'SOILSCAPE'
    ],
                              dtype='U256')

    reader = PointDataResults(results_fname, read_only=True)
    df = reader.read_loc(None)
    nptest.assert_equal(sorted(network_should), sorted(df['network'].values))
    assert np.all(df.gpi.values == np.arange(8))
    assert (reader.read_ts(0).index.size == 357)
    expected_columns = np.array(['R', 'p_R', 'RMSD'])
    assert np.all(reader.read_ts(1).columns.values == expected_columns)
def test_ascat_ismn_validation():
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    test_data = os.path.join(os.path.dirname(__file__), '..', 'test-data')

    ascat_root = os.path.join(test_data, 'sat', 'ascat', 'netcdf')
    ascat_reader = AscatSsmCdr(
        os.path.join(ascat_root, '55R22'),
        os.path.join(ascat_root, 'grid'),
        grid_filename='TUW_WARP5_grid_info_2_1.nc',
        static_layer_path=os.path.join(test_data, 'sat', 'h_saf',
                                       'static_layer'))
    ascat_reader.read_bulk = True

    # Initialize ISMN reader
    ismn_reader = ISMN_Interface(
        os.path.join(test_data, 'ismn', 'multinetwork', 'header_values'))

    ids = ismn_reader.get_dataset_ids(variable='soil moisture',
                                      min_depth=0,
                                      max_depth=0.1)
    # one job per time series: (id, longitude, latitude)
    id_meta_pairs = ((i, ismn_reader.metadata[i]) for i in ids)
    jobs = [(i, meta['longitude'], meta['latitude'])
            for i, meta in id_meta_pairs]

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!
    save_path = tempfile.mkdtemp()

    # Create the validation object.
    reader_config = {
        'ISMN': {
            'class': ismn_reader,
            'columns': ['soil moisture']
        },
        'ASCAT': {
            'class': ascat_reader,
            'columns': ['sm'],
            'kwargs': {
                'mask_frozen_prob': 80,
                'mask_snow_prob': 80,
                'mask_ssf': True
            }
        }
    }
    read_ts_names = {'ASCAT': 'read', 'ISMN': 'read_ts'}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(reader_config,
                           'ISMN',
                           period,
                           read_ts_names=read_ts_names)

    basic_metrics = metrics_calculators.BasicMetrics(other_name='k1')
    process = Validation(
        datasets,
        'ISMN',
        temporal_ref='ASCAT',
        scaling='lin_cdf_match',
        scaling_ref='ASCAT',
        metrics_calculators={(2, 2): basic_metrics.calc_metrics},
        period=period)

    for job in jobs:
        netcdf_results_manager(process.calc(*job), save_path)

    results_fname = os.path.join(save_path,
                                 'ASCAT.sm_with_ISMN.soil moisture.nc')

    vars_should = [
        u'n_obs', u'tau', u'gpi', u'RMSD', u'lon', u'p_tau', u'BIAS',
        u'p_rho', u'rho', u'lat', u'R', u'p_R', u'time', u'idx', u'_row_size'
    ]
    n_obs_should = [384, 357, 482, 141, 251, 1927, 1887, 1652]
    rho_should = np.array([
        0.70022893, 0.53934574, 0.69356072, 0.84189808, 0.74206454,
        0.30299741, 0.53143877, 0.62204134
    ],
                          dtype=np.float32)
    rmsd_should = np.array([
        7.72966719, 11.58347607, 14.57700157, 13.06224251, 12.90389824,
        14.24668026, 21.19682884, 17.3883934
    ],
                           dtype=np.float32)

    # results are compared order-independently, hence the sorting
    with nc.Dataset(results_fname, mode='r') as ds:
        stored = ds.variables
        assert sorted(list(stored.keys())) == sorted(vars_should)
        assert sorted(stored['n_obs'][:].tolist()) == sorted(n_obs_should)
        nptest.assert_allclose(sorted(rho_should),
                               sorted(stored['rho'][:]),
                               rtol=1e-4)
        nptest.assert_allclose(sorted(rmsd_should),
                               sorted(stored['RMSD'][:]),
                               rtol=1e-4)
def setUp(self) -> None:
    """Open an ``ISMN_Interface`` on ``self.testdata_zip_path`` and keep it as ``self.ds``."""
    zip_path = self.testdata_zip_path
    self.ds = ISMN_Interface(zip_path)
class Test_ISMN_Interface_CeopUnzipped(unittest.TestCase):
    """
    Tests for ``ISMN_Interface`` on the test data directory
    ``Data_seperate_files_20170810_20180809``, restricted to the COSMOS
    network.
    """

    @classmethod
    def setUpClass(cls):
        """
        Locate the test data and check that loading it with an empty
        network selection yields no networks.
        """
        super().setUpClass()

        testdata = os.path.join(testdata_root,
                                "Data_seperate_files_20170810_20180809")
        metadata_path = os.path.join(testdata, "python_metadata")

        # NOTE(review): presumably removes metadata left over from earlier
        # runs so that it is generated again below - confirm in `cleanup`.
        cleanup(metadata_path)
        ds = ISMN_Interface(testdata, network=[], parallel=True)
        assert ds.networks == OrderedDict()

        cls.testdata = testdata

    def setUp(self) -> None:
        """Open the test data for the COSMOS network only."""
        self.ds = ISMN_Interface(self.testdata, network=["COSMOS"])

    def tearDown(self) -> None:
        """Close the files held by the interface and shut down logging."""
        self.ds.close_files()
        logging.shutdown()

    def test_list(self):
        """The deprecated list_* methods report 1 network, 2 stations, 2 sensors."""
        with pytest.deprecated_call():
            assert len(self.ds.list_networks()) == 1
            assert len(self.ds.list_stations()) == len(
                self.ds.list_stations("COSMOS")) == 2
            assert len(self.ds.list_sensors()) == 2
            assert len(self.ds.list_sensors(station="Barrow-ARM")) == 1

    def test_network_for_station(self):
        """Both stations are attributed to the COSMOS network."""
        assert self.ds.network_for_station("Barrow-ARM") == "COSMOS"
        assert self.ds.network_for_station("ARM-1") == "COSMOS"

    def test_stations_that_measure(self):
        """Only the two known stations measure soil moisture; unknown variables match nothing."""
        for s in self.ds.stations_that_measure("soil_moisture"):
            assert s.name in ["ARM-1", "Barrow-ARM"]

        for s in self.ds.stations_that_measure("nonexisting"):
            raise AssertionError("Found var that doesnt exist")

    def test_get_dataset_ids(self):
        """Dataset ids are selected correctly by variable, depth range and metadata filter."""
        ids = self.ds.get_dataset_ids("soil_moisture",
                                      max_depth=100,
                                      groupby="network")
        assert list(ids.keys()) == ["COSMOS"]
        assert ids["COSMOS"] == [0, 1]

        ids = self.ds.get_dataset_ids("soil_moisture", max_depth=0.19)
        assert ids == [0]

        ids = self.ds.get_dataset_ids(
            ["soil_moisture"],
            max_depth=99,
            filter_meta_dict={
                "lc_2010": 210,
                "network": "COSMOS",
                "station": "Barrow-ARM",
            },
        )
        assert ids == [1]

        ids = self.ds.get_dataset_ids("novar")
        assert len(ids) == 0

        ids = self.ds.get_dataset_ids(["soil_moisture", "shouldhavenoeffect"],
                                      0.0, 0.19)  # should get 1
        assert len(ids) == 1

        ids = self.ds.get_dataset_ids("soil_moisture", 0.0,
                                      1.0)  # should get 2
        assert len(ids) == 2

        ids = self.ds.get_dataset_ids("soil_moisture",
                                      0.0,
                                      1.0,
                                      filter_meta_dict={"lc_2010": 210
                                                        })  # should get 1
        assert len(ids) == 1

        ids = self.ds.get_dataset_ids("nonexisting")  # should get 0
        assert len(ids) == 0

    def test_read_ts(self):
        """Both ``read`` and ``read_ts`` return non-empty data."""
        data1 = self.ds.read(0)
        assert not data1.empty

        data2, _ = self.ds.read_ts(1, return_meta=True)
        assert not data2.empty

    def test_read_metadata(self):
        """Metadata returned by ``read_ts`` agrees with ``read_metadata`` in all formats."""
        _, meta = self.ds.read_ts(1, return_meta=True)
        assert all(meta == self.ds.read_metadata(1, format="pandas"))
        assert self.ds.read_metadata(1, format="dict") is not None
        assert self.ds.read_metadata([1], format="obj") is not None
        assert not self.ds.metadata.empty
        assert self.ds.metadata.loc[1]['station']['val'] \
               == self.ds.read_metadata([0, 1]).loc[1, ('station', 'val')]

    def test_find_nearest_station(self):
        """Searching at a station's exact coordinates returns that station."""
        should_lon, should_lat = -156.62870, 71.32980

        station = self.ds.find_nearest_station(should_lon, should_lat)

        assert station.lon == should_lon
        assert station.lat == should_lat

    def test_plot_station_locations(self):
        """Plotting the station locations writes exactly one file."""
        with TemporaryDirectory() as out_dir:
            outpath = os.path.join(out_dir, "plot.png")
            self.ds.plot_station_locations(["soil_moisture", 'precipitation'],
                                           markersize=5,
                                           filename=outpath)
            assert len(os.listdir(out_dir)) == 1

    def test_get_min_max_obs_timestamps(self):
        """The interface reports the expected first and last observation time."""
        tmin, tmax = self.ds.get_min_max_obs_timestamps("soil_moisture",
                                                        max_depth=0.19)
        assert tmin == datetime(2017, 8, 10, 0)
        assert tmax == datetime(2018, 8, 9, 23)

    def test_get_min_max_obs_timestamps_for_station(self):
        """Station ARM-1 reports the expected first and last observation time."""
        station = self.ds.collection.networks["COSMOS"].stations["ARM-1"]
        tmin, tmax = station.get_min_max_obs_timestamp("soil_moisture", 0,
                                                       0.19)
        assert tmin == datetime(2017, 8, 10, 0)
        assert tmax == datetime(2018, 8, 9, 23)

    def test_get_static_var_val(self):
        """Land cover and climate classes of the selected sensors are as expected."""
        vals = self.ds.get_static_var_vals("soil_moisture", max_depth=0.19)
        assert vals == {130: "Grassland"}

        vals = self.ds.get_landcover_types("soil_moisture", max_depth=100)
        assert len(vals) == 2
        assert vals[130] == "Grassland"
        assert vals[210] == "Water"
        self.ds.print_landcover_dict()

        vals = self.ds.get_climate_types("soil_moisture",
                                         max_depth=100,
                                         climate="climate_KG")
        assert len(vals) == 2
        assert vals["ET"] == "Polar Tundra"
        assert vals["Cfa"] == "Temperate Without Dry Season, Hot Summer"
        self.ds.print_climate_dict()

    def test_get_var(self):
        """Soil moisture is the only variable in the loaded data."""
        variables = self.ds.get_variables()
        assert variables == ["soil_moisture"]

    def test_get_sensors(self):
        """Sensor iteration yields the expected number of readable sensors."""
        i = 0
        for _, station in self.ds.collection.iter_stations(
                filter_meta_dict={"network": "COSMOS"}):
            for se in station.iter_sensors():
                data = se.read_data()
                # check if the networks is COSMOS or station in [ARM, Barrow-ARM]
                assert not data.empty
                # check something for that one station
                i += 1
        assert i == 2

        i = 0
        for se in self.ds.networks["COSMOS"].stations[
                "Barrow-ARM"].iter_sensors():
            data = se.read_data()
            assert not data.empty
            # check something for that one station
            i += 1
        assert i == 1

        i = 0
        for _, _, sens in self.ds.collection.iter_sensors(
                depth=Depth(0, 1),
                filter_meta_dict={"station": ["Barrow-ARM", "ARM-1"]},
        ):
            data = sens.read_data()
            assert not data.empty
            i += 1
        assert i == 2

        for _, station in self.ds.collection.iter_stations():
            for se in station.iter_sensors(variable="nonexisting"):
                raise ValueError("Found sensor, although none should exist")

    def test_get_nearest_station(self):
        """Nearest-station lookup works on the collection, the grid and the interface."""
        should_lon, should_lat = -156.62870, 71.32980

        station, dist = self.ds.collection.get_nearest_station(
            should_lon, should_lat)
        assert dist == 0
        assert station.lon == should_lon
        assert station.lat == should_lat

        # Truncated coordinates no longer coincide with the station.
        _, dist = self.ds.collection.grid.find_nearest_gpi(
            int(should_lon), int(should_lat))
        assert dist != 0

        for net in self.ds.collection.iter_networks():
            if station.name in net.stations.keys():
                assert net.stations[station.name].lon == should_lon
                assert net.stations[station.name].lat == should_lat

        # No station within max_dist of (0, 0): both return values are None.
        station, dist = self.ds.find_nearest_station(0,
                                                     0,
                                                     return_distance=True,
                                                     max_dist=100)
        assert station is None
        assert dist is None

    def test_citation(self):
        """Exported citations cover every network and are written to a non-empty file."""
        with TemporaryDirectory() as out_dir:
            out_file = os.path.join(out_dir, 'citation.txt')
            refs = self.ds.collection.export_citations(out_file=out_file)

            assert all(net in refs.keys()
                       for net in self.ds.collection.networks.keys())
            assert os.path.exists(out_file)

            with open(out_file, mode='r') as f:
                lines = f.readlines()
                assert len(lines) > 0
static_layers_folder = os.path.join(testdata_folder,
                                    'sat/h_saf/static_layer')

# Build the ASCAT reader from the data, grid and static layer folders.
ascat_reader = AscatSsmCdr(
    ascat_data_folder,
    ascat_grid_folder,
    grid_filename='TUW_WARP5_grid_info_2_1.nc',
    static_layer_path=static_layers_folder,
)
ascat_reader.read_bulk = True


# Initialize ISMN reader

# In[4]:


ismn_data_folder = os.path.join(testdata_folder,
                                'ismn/multinetwork/header_values')
ismn_reader = ISMN_Interface(ismn_data_folder)


# The validation is run based on jobs. A job consists of at least three lists
# or numpy arrays specifying the grid point index, its latitude and longitude.
# In the case of the ISMN we can use the `dataset_ids` that identify every
# time series in the downloaded ISMN data as our grid point index. We can
# then get longitude and latitude from the metadata of the dataset.
#
# **DO NOT CHANGE** the name ***jobs*** because it will be searched during
# the parallel processing!

# In[5]:


jobs = []

ids = ismn_reader.get_dataset_ids(
    variable='soil moisture',
    min_depth=0,
    max_depth=0.1,
)
def init_worker(function: Any):
    """Attach a GCS client and the ISMN data to a worker function.

    Two attributes are set on ``function``: ``gcs_client``, an anonymous
    storage client, and ``ismn_data``, an ``ISMN_Interface`` opened with
    ``parallel=True`` on the path held by ``_ISMN_DATA_PATH_FLAG``.
    """
    anonymous_client = storage.Client.create_anonymous_client()
    setattr(function, 'gcs_client', anonymous_client)

    ismn_path = _ISMN_DATA_PATH_FLAG.value
    setattr(function, 'ismn_data', ISMN_Interface(ismn_path, parallel=True))