def test_validation_n3_k2_temporal_matching_no_matches():
    """Run a (3, 2) validation on the ``setup_two_without_overlap`` datasets.

    Every processed job is expected to return a result without any
    dataset-combination keys.
    """
    expected = {}

    names = ["DS1", "DS2", "DS3"]
    manager = DataManager(
        setup_two_without_overlap(),
        "DS1",
        read_ts_names=dict.fromkeys(names, "read"),
    )

    matcher = temporal_matchers.BasicTemporalMatching(window=1 / 24.0)
    basic_metrics = metrics_calculators.BasicMetrics(other_name="k1")
    process = Validation(
        manager,
        "DS1",
        temporal_matcher=matcher.combinatory_matcher,
        scaling="lin_cdf_match",
        metrics_calculators={(3, 2): basic_metrics.calc_metrics},
    )

    for job in process.get_processing_jobs():
        results = process.calc(*job)
        # only the result keys are compared
        assert sorted(results) == sorted(expected)
def test_missing_data():
    """Smoke test: pairwise validation on datasets created with ``missing=True``.

    No assertion is made on the returned metrics; the test passes when
    ``Validation.calc`` completes without raising.
    """
    n_datasets = 5
    npoints = 5
    nsamples = 100

    datasets = create_datasets(n_datasets, npoints, nsamples, missing=True)
    metric_calculator = PairwiseIntercomparisonMetrics()

    val = Validation(
        datasets,
        spatial_ref="0-ERA5",
        metrics_calculators={(n_datasets, 2): metric_calculator.calc_metrics},
        # lowercase "h": the uppercase "H" unit alias is deprecated in
        # pandas >= 2.2 and emits a FutureWarning
        temporal_matcher=make_combined_temporal_matcher(
            pd.Timedelta(12, "h")
        ),
    )

    gpis = list(range(npoints))
    val.calc(gpis, gpis, gpis, rename_cols=False, only_with_temporal_ref=True)
def test_validation_n3_k2_temporal_matching_no_matches2():
    """Run a (3, 2) validation on ``setup_three_with_two_overlapping``.

    Only the dataset-combination keys of each job result are compared with
    the expectation; the metric values in ``tst_results`` are not checked.
    """

    def _expected_metrics():
        # a fresh set of arrays per combination, as in a literal dict
        return {
            "n_obs": np.array([1000], dtype=np.int32),
            "tau": np.array([np.nan], dtype=np.float32),
            "gpi": np.array([4], dtype=np.int32),
            "RMSD": np.array([0.0], dtype=np.float32),
            "lon": np.array([4.0]),
            "p_tau": np.array([np.nan], dtype=np.float32),
            "BIAS": np.array([0.0], dtype=np.float32),
            "p_rho": np.array([0.0], dtype=np.float32),
            "rho": np.array([1.0], dtype=np.float32),
            "lat": np.array([4.0]),
            "R": np.array([1.0], dtype=np.float32),
            "p_R": np.array([0.0], dtype=np.float32),
        }

    tst_results = {
        (("DS1", "x"), ("DS3", "y")): _expected_metrics(),
        (("DS1", "x"), ("DS3", "x")): _expected_metrics(),
    }

    manager = DataManager(
        setup_three_with_two_overlapping(),
        "DS1",
        read_ts_names=dict.fromkeys(["DS1", "DS2", "DS3"], "read"),
    )

    matcher = temporal_matchers.BasicTemporalMatching(window=1 / 24.0)
    basic_metrics = metrics_calculators.BasicMetrics(other_name="k1")
    process = Validation(
        manager,
        "DS1",
        temporal_matcher=matcher.combinatory_matcher,
        scaling="lin_cdf_match",
        metrics_calculators={(3, 2): basic_metrics.calc_metrics},
    )

    for job in process.get_processing_jobs():
        results = process.calc(*job)
        assert sorted(results) == sorted(tst_results)
def test_validation_n3_k2():
    """Run a (3, 2) validation on ``setup_TestDatasets`` with DS1 as reference.

    For every job the dataset-combination keys of the result must match
    the keys of ``tst_results``; metric values are not compared.
    """

    def _expected_metrics():
        return {
            'n_obs': np.array([1000], dtype=np.int32),
            'tau': np.array([np.nan], dtype=np.float32),
            'gpi': np.array([4], dtype=np.int32),
            'RMSD': np.array([0.], dtype=np.float32),
            'lon': np.array([4.]),
            'p_tau': np.array([np.nan], dtype=np.float32),
            'BIAS': np.array([0.], dtype=np.float32),
            'p_rho': np.array([0.], dtype=np.float32),
            'rho': np.array([1.], dtype=np.float32),
            'lat': np.array([4.]),
            'R': np.array([1.], dtype=np.float32),
            'p_R': np.array([0.], dtype=np.float32),
        }

    combinations = [
        (('DS1', 'x'), ('DS3', 'y')),
        (('DS1', 'x'), ('DS2', 'y')),
        (('DS1', 'x'), ('DS3', 'x')),
    ]
    tst_results = {combo: _expected_metrics() for combo in combinations}

    datasets = setup_TestDatasets()

    matcher = temporal_matchers.BasicTemporalMatching(window=1 / 24.0)
    basic_metrics = metrics_calculators.BasicMetrics(other_name='k1')
    process = Validation(
        datasets,
        'DS1',
        temporal_matcher=matcher.combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={(3, 2): basic_metrics.calc_metrics},
    )

    for job in process.get_processing_jobs():
        results = process.calc(*job)
        assert sorted(results) == sorted(tst_results)
def test_validation_n3_k2():
    """Validate three test datasets pairwise against the reference DS1.

    The result keys of each processed job are compared with the keys of
    the expected dictionary; the stored metric arrays are not compared.
    """

    def _metrics_template():
        # built anew on each call so every combination owns its arrays
        return {
            'n_obs': np.array([1000], dtype=np.int32),
            'tau': np.array([np.nan], dtype=np.float32),
            'gpi': np.array([4], dtype=np.int32),
            'RMSD': np.array([0.], dtype=np.float32),
            'lon': np.array([4.]),
            'p_tau': np.array([np.nan], dtype=np.float32),
            'BIAS': np.array([0.], dtype=np.float32),
            'p_rho': np.array([0.], dtype=np.float32),
            'rho': np.array([1.], dtype=np.float32),
            'lat': np.array([4.]),
            'R': np.array([1.], dtype=np.float32),
            'p_R': np.array([0.], dtype=np.float32),
        }

    tst_results = {
        (('DS1', 'x'), ('DS3', 'y')): _metrics_template(),
        (('DS1', 'x'), ('DS2', 'y')): _metrics_template(),
        (('DS1', 'x'), ('DS3', 'x')): _metrics_template(),
    }
    expected_keys = sorted(tst_results)

    datasets = setup_TestDatasets()

    process = Validation(
        datasets,
        'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0
        ).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name='k1'
            ).calc_metrics
        },
    )

    jobs = process.get_processing_jobs()
    for job in jobs:
        results = process.calc(*job)
        assert sorted(results) == expected_keys
def test_validation_n3_k2_masking_no_data_remains():
    """Masking that leaves no reference data must yield empty results.

    Two masking datasets (limits 500 and 1000) are applied. The masked
    reference frame for gpi 1 must be empty, and every processing job must
    return a result without any dataset-combination keys.
    """
    datasets = setup_TestDatasets()

    # setup masking datasets
    grid = grids.CellGrid(
        np.array([1, 2, 3, 4]),
        np.array([1, 2, 3, 4]),
        np.array([4, 4, 2, 1]),
        gpis=np.array([1, 2, 3, 4]),
    )

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        'masking1': {
            'class': mds1,
            'columns': ['x'],
            'args': [],
            'kwargs': {'limit': 500},
            'use_lut': False,
            'grids_compatible': True,
        },
        'masking2': {
            'class': mds2,
            'columns': ['x'],
            'args': [],
            'kwargs': {'limit': 1000},
            'use_lut': False,
            'grids_compatible': True,
        },
    }

    process = Validation(
        datasets,
        'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0
        ).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name='k1'
            ).calc_metrics
        },
        masking_datasets=mds,
    )

    gpi_info = (1, 1, 1)
    ref_df = datasets['DS1']['class'].read(1)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DeprecationWarning)
        new_ref_df = process.mask_dataset(ref_df, gpi_info)
    assert len(new_ref_df) == 0
    nptest.assert_allclose(new_ref_df.x.values, np.arange(1000, 1000))

    # Expected results are an empty mapping (it used to be a list that was
    # indexed like a dict inside a comparison loop that could never run).
    expected = {}
    for job in process.get_processing_jobs():
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=DeprecationWarning)
            results = process.calc(*job)
        assert sorted(results) == sorted(expected)
def test_validation_n3_k2_masking_no_data_remains():
    """Masking that leaves no reference data must yield empty results.

    Two masking datasets (limits 500 and 1000) are applied. The masked
    reference frame for gpi 1 must be empty, and every processing job must
    return a result without any dataset-combination keys.
    """
    datasets = setup_TestDatasets()

    # setup masking datasets
    grid = grids.CellGrid(
        np.array([1, 2, 3, 4]),
        np.array([1, 2, 3, 4]),
        np.array([4, 4, 2, 1]),
        gpis=np.array([1, 2, 3, 4]),
    )

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        'masking1': {
            'class': mds1,
            'columns': ['x'],
            'args': [],
            'kwargs': {'limit': 500},
            'use_lut': False,
            'grids_compatible': True,
        },
        'masking2': {
            'class': mds2,
            'columns': ['x'],
            'args': [],
            'kwargs': {'limit': 1000},
            'use_lut': False,
            'grids_compatible': True,
        },
    }

    process = Validation(
        datasets,
        'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0
        ).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name='k1'
            ).calc_metrics
        },
        masking_datasets=mds,
    )

    gpi_info = (1, 1, 1)
    ref_df = datasets['DS1']['class'].read_ts(1)
    new_ref_df = process.mask_dataset(ref_df, gpi_info)
    assert len(new_ref_df) == 0
    nptest.assert_allclose(new_ref_df.x.values, np.arange(1000, 1000))

    # Expected results are an empty mapping (it used to be a list that was
    # indexed like a dict inside a comparison loop that could never run).
    expected = {}
    for job in process.get_processing_jobs():
        results = process.calc(*job)
        assert sorted(results) == sorted(expected)
def test_validation_n2_k2_temporal_matching_no_matches():
    """Run a (2, 2) validation on the ``setup_two_without_overlap`` datasets.

    Each job result is expected to contain no dataset-combination keys.
    """
    expected = {}

    datasets = setup_two_without_overlap()

    matcher = temporal_matchers.BasicTemporalMatching(window=1 / 24.0)
    basic_metrics = metrics_calculators.BasicMetrics(other_name='k1')
    process = Validation(
        datasets,
        'DS1',
        temporal_matcher=matcher.combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={(2, 2): basic_metrics.calc_metrics},
    )

    for job in process.get_processing_jobs():
        results = process.calc(*job)
        assert sorted(results) == sorted(expected)
def test_validation_n2_k2_temporal_matching_no_matches():
    """Validate two datasets from ``setup_two_without_overlap`` pairwise.

    The keys returned for every job must equal the keys of the (empty)
    expected dictionary.
    """
    tst_results = {}
    expected_keys = sorted(tst_results)

    datasets = setup_two_without_overlap()

    process = Validation(
        datasets,
        'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0
        ).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (2, 2): metrics_calculators.BasicMetrics(
                other_name='k1'
            ).calc_metrics
        },
    )

    jobs = process.get_processing_jobs()
    for job in jobs:
        results = process.calc(*job)
        assert sorted(results) == expected_keys
def test_TripleCollocationMetrics(testdata_generator):
    """Check TripleCollocationMetrics for consistency, file output and CIs.

    Three things are tested:

    1. For the random test data (recognised by the ``"col1_name"`` dataset),
       the ``snr``, ``err_std`` and ``beta`` values obtained for one dataset
       from the different triplets containing it must agree within 10 %
       relative to the first value.
    2. ``netcdf_results_manager`` writes one ``.nc`` file per result key.
    3. With ``bootstrap_cis=True`` (random data only) lower/upper confidence
       bounds exist and enclose the metric value.

    Parameters
    ----------
    testdata_generator : callable
        Fixture returning ``(datasets, expected)``; only ``datasets`` is used.
    """
    datasets, _ = testdata_generator()

    refname = "reference_name"
    tca_metrics = ["snr", "err_std", "beta"]
    # we only test the TCA results with the random data, since for the
    # constant data all covariances are zero and TCA therefore doesn't work.
    has_random_data = "col1_name" in datasets

    triplet_metrics_calculator = TripleCollocationMetrics(
        refname, bootstrap_cis=False
    )

    # lowercase "h": the uppercase "H" unit alias is deprecated in
    # pandas >= 2.2
    matcher = make_combined_temporal_matcher(pd.Timedelta(6, "h"))

    val_triplet = Validation(
        datasets,
        refname,
        scaling=None,  # doesn't work with the constant test data
        temporal_matcher=matcher,
        metrics_calculators={(4, 3): triplet_metrics_calculator.calc_metrics},
    )
    results_triplet = val_triplet.calc([1], [1], [1], rename_cols=False)

    if has_random_data:
        for metric in tca_metrics:
            for dset in datasets:
                dkey = (dset, datasets[dset]["columns"][0])
                # value of this metric for ``dset`` in every triplet it
                # takes part in
                values = [
                    results_triplet[tkey][(metric, dset)][0]
                    for tkey in results_triplet
                    if dkey in tkey
                ]
                diff = np.abs(np.diff(values))
                assert diff.max() / values[0] < 0.1

    # check if writing to file works
    results_path = Path("__test_results")
    # if this throws, there's either some data left over from previous tests,
    # or some data is named __test_results. Remove the __test_results
    # directory from your current directory to make the test work again.
    assert not results_path.exists()
    results_path.mkdir(exist_ok=True, parents=True)
    try:
        netcdf_results_manager(results_triplet, results_path.name)
        assert results_path.exists()
        for key in results_triplet:
            fname = "_with_".join(map(lambda t: ".".join(t), key)) + ".nc"
            assert (results_path / fname).exists()
            # res = xr.open_dataset(results_path / fname)
            # for metric in ["snr", "err_std", "beta"]:
            #     for dset, _ in key:
            #         mkey = metric + "__" + dset
            #         assert mkey in res.data_vars
    finally:
        # always clean up, otherwise a failed assertion above would leave
        # the directory behind and break the next run of this test
        shutil.rmtree(results_path)

    # now with CIs, again only for random data
    if has_random_data:
        triplet_metrics_calculator = TripleCollocationMetrics(
            refname, bootstrap_cis=True
        )
        val_triplet = Validation(
            datasets,
            refname,
            scaling=None,  # doesn't work with the constant test data
            temporal_matcher=matcher,
            metrics_calculators={
                (4, 3): triplet_metrics_calculator.calc_metrics
            },
        )
        results_triplet = val_triplet.calc([1], [1], [1], rename_cols=False)
        for key in results_triplet:
            res = results_triplet[key]
            for dset, _ in key:
                for metric in tca_metrics:
                    lkey = f"{metric}_ci_lower"
                    ukey = f"{metric}_ci_upper"
                    assert (lkey, dset) in res
                    assert (ukey, dset) in res
                    assert res[(lkey, dset)] <= res[(metric, dset)]
                    assert res[(metric, dset)] <= res[(ukey, dset)]
def test_PairwiseIntercomparisonMetrics(testdata_generator):
    """Compare PairwiseIntercomparisonMetrics to known values and to
    IntercomparisonMetrics.

    The pairwise results are first checked for structure (2-tuples of
    tuples as keys, dictionaries with exactly ``expected_metrics`` as
    values) and compared with the expected values supplied by the fixture.
    Afterwards the same data is run through ``IntercomparisonMetrics`` and
    the metrics of both implementations are compared.

    Parameters
    ----------
    testdata_generator : callable
        Fixture returning ``(datasets, expected)``.
    """
    datasets, expected = testdata_generator()

    # for the pairwise intercomparison metrics it's important that we use
    # make_combined_temporal_matcher
    val = Validation(
        datasets,
        "reference_name",
        scaling=None,  # doesn't work with the constant test data
        # lowercase "h": the uppercase "H" unit alias is deprecated in
        # pandas >= 2.2
        temporal_matcher=make_combined_temporal_matcher(pd.Timedelta(6, "h")),
        metrics_calculators={
            (4, 2): PairwiseIntercomparisonMetrics(
                calc_spearman=True, analytical_cis=False
            ).calc_metrics
        },
    )
    results_pw = val.calc([1], [1], [1], rename_cols=False)

    # in results_pw, the keys are pairs such as (("c1name", "c1"),
    # ("refname", "ref")).
    # Each value is a single dictionary with the values of the metrics
    expected_metrics = [
        "R", "p_R", "BIAS", "RMSD", "mse", "RSS", "mse_corr", "mse_bias",
        "urmsd", "mse_var", "n_obs", "gpi", "lat", "lon", "rho", "p_rho",
        "tau", "p_tau",
    ]
    for key in results_pw:
        assert isinstance(key, tuple)
        assert len(key) == 2
        assert all(map(lambda x: isinstance(x, tuple), key))
        assert isinstance(results_pw[key], dict)
        assert sorted(expected_metrics) == sorted(results_pw[key].keys())
        for m in expected_metrics:
            if m in expected[key]:
                assert_equal(results_pw[key][m], expected[key][m])

    # preparation of IntercomparisonMetrics run for comparison
    ds_names = list(datasets.keys())
    metrics = IntercomparisonMetrics(
        dataset_names=ds_names,
        # passing the names here explicitly, see GH issue #220
        refname="reference_name",
        other_names=ds_names[1:],
        calc_tau=True,
    )
    val = Validation(
        datasets,
        "reference_name",
        scaling=None,
        temporal_matcher=None,  # use default here
        metrics_calculators={(4, 4): metrics.calc_metrics},
    )

    results = val.calc(1, 1, 1, rename_cols=False)

    # results is a dictionary with one entry and key
    # (('c1name', 'c1'), ('c2name', 'c2'), ('c3name', 'c3'), ('refname',
    # 'ref')), the value is a dictionary with all the results, where the
    # metrics are joined with "_between_" with the combination of datasets,
    # which is joined with "_and_", e.g. for R between ``refname`` and
    # ``c1name`` the key is "R_between_refname_and_c1name"
    common_metrics = ["n_obs", "gpi", "lat", "lon"]
    pw_metrics = list(set(expected_metrics) - set(common_metrics))
    # there's some sorting done at some point in pytesmo
    oldkey = tuple(sorted([(name, name.split("_")[0]) for name in ds_names]))
    res_old = results[oldkey]
    for key in results_pw:
        res = results_pw[key]

        # handle the full dataset metrics
        for m in common_metrics:
            assert_equal(res[m], res_old[m])

        # the pair only depends on the key, so resolve it once per key
        othername = key[0][0]
        refname = key[1][0]
        if othername == "reference_name":
            # sorting might be different, see GH #220
            othername = key[1][0]
            refname = key[0][0]

        # now get the metrics and compare to the right combination
        for m in pw_metrics:
            old_m_key = f"{m}_between_{refname}_and_{othername}"
            if m == "BIAS":
                # PairwiseIntercomparisonMetrics has the result as (other,
                # ref), and therefore "bias between other and ref", compared
                # to "bias between ref and other" in IntercomparisonMetrics;
                # this is related to issue #220
                assert_equal(np.abs(res[m]), np.abs(res_old[old_m_key]))
            elif m == "urmsd":
                # the old implementation differs from the new implementation
                pass
            else:
                assert_equal(res[m], res_old[old_m_key])
def test_validation_n3_k2_masking():
    """Validation with two masking datasets keeps 250 observations per gpi.

    The masked reference frame for gpi 1 must hold the values 750..999, and
    every job must report ``n_obs == 250`` for each gpi and dataset
    combination.
    """
    combinations = [
        (('DS1', 'x'), ('DS3', 'y')),
        (('DS1', 'x'), ('DS2', 'y')),
        (('DS1', 'x'), ('DS3', 'x')),
    ]

    def _expected(n_gpis):
        # one n_obs entry of 250 per gpi in the job
        return {
            combo: {'n_obs': np.full(n_gpis, 250, dtype=np.int32)}
            for combo in combinations
        }

    # cell 4 in this example has two gpis so it returns different results.
    # Keyed by the number of gpis in a job (the literal used to list key 1
    # twice).
    tst_results = {1: _expected(1), 2: _expected(2)}

    datasets = setup_TestDatasets()

    # setup masking datasets
    grid = grids.CellGrid(
        np.array([1, 2, 3, 4]),
        np.array([1, 2, 3, 4]),
        np.array([4, 4, 2, 1]),
        gpis=np.array([1, 2, 3, 4]),
    )

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        'masking1': {
            'class': mds1,
            'columns': ['x'],
            'args': [],
            'kwargs': {'limit': 500},
            'use_lut': False,
            'grids_compatible': True,
        },
        'masking2': {
            'class': mds2,
            'columns': ['x'],
            'args': [],
            'kwargs': {'limit': 750},
            'use_lut': False,
            'grids_compatible': True,
        },
    }

    process = Validation(
        datasets,
        'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0
        ).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name='k1'
            ).calc_metrics
        },
        masking_datasets=mds,
    )

    gpi_info = (1, 1, 1)
    ref_df = datasets['DS1']['class'].read_ts(1)
    new_ref_df = process.mask_dataset(ref_df, gpi_info)
    assert len(new_ref_df) == 250
    nptest.assert_allclose(new_ref_df.x.values, np.arange(750, 1000))

    jobs = process.get_processing_jobs()
    for job in jobs:
        results = process.calc(*job)

        tst = tst_results[len(job[0])]
        assert sorted(results) == sorted(tst)
        # the keys are identical at this point, so compare entry by entry
        for key in sorted(results):
            nptest.assert_almost_equal(
                results[key]['n_obs'], tst[key]['n_obs']
            )
def test_ascat_ismn_validation_metadata_rolling(ascat_reader):
    """
    Test processing framework with some ISMN and ASCAT sample data,
    using RollingMetrics and per-station metadata.
    """
    # Initialize ISMN reader
    ismn_data_folder = os.path.join(
        os.path.dirname(__file__),
        "..",
        "test-data",
        "ismn",
        "multinetwork",
        "header_values",
    )
    ismn_reader = ISMN_Interface(ismn_data_folder)

    ids = ismn_reader.get_dataset_ids(
        variable="soil moisture", min_depth=0, max_depth=0.1
    )

    metadata_dict_template = {
        "network": np.array(["None"], dtype="U256"),
        "station": np.array(["None"], dtype="U256"),
        "landcover": np.float32([np.nan]),
        "climate": np.array(["None"], dtype="U4"),
    }

    # one job per ISMN dataset id: (id, lon, lat, [metadata])
    jobs = []
    for idx in ids:
        station_meta = ismn_reader.metadata[idx]
        job_metadata = [
            {
                "network": station_meta["network"],
                "station": station_meta["station"],
                "landcover": station_meta["landcover_2010"],
                "climate": station_meta["climate"],
            }
        ]
        job = (
            idx,
            station_meta["longitude"],
            station_meta["latitude"],
            job_metadata,
        )
        jobs.append(job)

    save_path = tempfile.mkdtemp()

    # Create the validation object.
    ascat_kwargs = {
        "mask_frozen_prob": 80,
        "mask_snow_prob": 80,
        "mask_ssf": True,
    }
    datasets = {
        "ISMN": {"class": ismn_reader, "columns": ["soil moisture"]},
        "ASCAT": {
            "class": ascat_reader,
            "columns": ["sm"],
            "kwargs": ascat_kwargs,
        },
    }

    read_ts_names = {"ASCAT": "read", "ISMN": "read_ts"}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(
        datasets, "ISMN", period, read_ts_names=read_ts_names
    )

    rolling_metrics = metrics_calculators.RollingMetrics(
        other_name="k1", metadata_template=metadata_dict_template
    )
    process = Validation(
        datasets,
        "ISMN",
        temporal_ref="ASCAT",
        scaling="lin_cdf_match",
        scaling_ref="ASCAT",
        metrics_calculators={(2, 2): rolling_metrics.calc_metrics},
        period=period,
    )

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(
            results, save_path, ts_vars=["R", "p_R", "RMSD"]
        )

    results_fname = os.path.join(
        save_path, "ASCAT.sm_with_ISMN.soil moisture.nc"
    )

    # NOTE: this list is assembled but not compared against the file
    vars_should = [
        u"gpi",
        u"lon",
        u"lat",
        u"R",
        u"p_R",
        u"time",
        u"idx",
        u"_row_size",
    ]
    vars_should.extend(metadata_dict_template)

    network_should = np.array(
        ["MAQU"] * 2 + ["SCAN"] * 3 + ["SOILSCAPE"] * 3,
        dtype="U256",
    )

    reader = PointDataResults(results_fname, read_only=True)
    df = reader.read_loc(None)
    nptest.assert_equal(sorted(network_should), sorted(df["network"].values))
    assert np.all(df.gpi.values == np.arange(8))
    assert reader.read_ts(0).index.size == 357
    ts_columns = reader.read_ts(1).columns.values
    assert np.all(ts_columns == np.array(["R", "p_R", "RMSD"]))
def test_validation_n3_k2_masking_no_data_remains():
    """Masking that leaves no reference data must yield empty results.

    Two masking datasets (limits 500 and 1000) are applied. The masked
    reference frame for gpi 1 must be empty, and every processing job must
    return a result without any dataset-combination keys.
    """
    datasets = setup_TestDatasets()

    # setup masking datasets
    grid = grids.CellGrid(
        np.array([1, 2, 3, 4]),
        np.array([1, 2, 3, 4]),
        np.array([4, 4, 2, 1]),
        gpis=np.array([1, 2, 3, 4]),
    )

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        "masking1": {
            "class": mds1,
            "columns": ["x"],
            "args": [],
            "kwargs": {"limit": 500},
            "use_lut": False,
            "grids_compatible": True,
        },
        "masking2": {
            "class": mds2,
            "columns": ["x"],
            "args": [],
            "kwargs": {"limit": 1000},
            "use_lut": False,
            "grids_compatible": True,
        },
    }

    process = Validation(
        datasets,
        "DS1",
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0
        ).combinatory_matcher,
        scaling="lin_cdf_match",
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name="k1"
            ).calc_metrics
        },
        masking_datasets=mds,
    )

    gpi_info = (1, 1, 1)
    ref_df = datasets["DS1"]["class"].read(1)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        new_ref_df = process.mask_dataset(ref_df, gpi_info)
    assert len(new_ref_df) == 0
    nptest.assert_allclose(new_ref_df.x.values, np.arange(1000, 1000))

    # Expected results are an empty mapping (it used to be a list that was
    # indexed like a dict inside a comparison loop that could never run).
    expected = {}
    for job in process.get_processing_jobs():
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            results = process.calc(*job)
        assert sorted(results) == sorted(expected)
# (3, 3): triple_collocation} # ``` # Create the variable ***save_path*** which is a string representing the path where the results will be saved. **DO NOT CHANGE** the name ***save_path*** because it will be searched during the parallel processing! # In[10]: save_path = tempfile.mkdtemp() # In[22]: import pprint for job in jobs: results = process.calc(*job) pprint.pprint(results) netcdf_results_manager(results, save_path) # The validation is then performed by looping over all the defined jobs and storing the results. # You can see that the results are a dictionary where the key is a tuple defining the exact combination of datasets and columns that were used for the calculation of the metrics. The metrics themselves are a dictionary of `metric-name: numpy.ndarray` which also includes information about the gpi, lon and lat. Since all the information contained in the job is given to the metric calculator, it can be stored in the results. # # Storing the results to disk is currently supported by the `netcdf_results_manager`, which creates a netCDF file for each dataset combination and stores each metric as a variable. We can inspect the stored netCDF file, which is named after the dictionary key: # In[23]: import netCDF4 results_fname = os.path.join(save_path, 'ASCAT.sm_with_ISMN.soil moisture.nc') with netCDF4.Dataset(results_fname) as ds:
def test_ascat_ismn_validation():
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    here = os.path.dirname(__file__)
    ascat_root = os.path.join(here, 'test-data', 'sat', 'ascat', 'netcdf')
    ascat_data_folder = os.path.join(ascat_root, '55R22')
    ascat_grid_folder = os.path.join(ascat_root, 'grid')

    ascat_reader = AscatH25_SSM(ascat_data_folder, ascat_grid_folder)
    ascat_reader.read_bulk = True
    ascat_reader._load_grid_info()

    # Initialize ISMN reader
    ismn_data_folder = os.path.join(
        os.path.dirname(__file__),
        'test-data', 'ismn', 'multinetwork', 'header_values')
    ismn_reader = ISMN_Interface(ismn_data_folder)

    ids = ismn_reader.get_dataset_ids(
        variable='soil moisture', min_depth=0, max_depth=0.1)

    # one job per ISMN dataset id: (id, lon, lat)
    jobs = []
    for idx in ids:
        station_meta = ismn_reader.metadata[idx]
        job = (idx, station_meta['longitude'], station_meta['latitude'])
        jobs.append(job)

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!
    save_path = tempfile.mkdtemp()

    # Create the validation object.
    ismn_config = {
        'class': ismn_reader,
        'columns': ['soil moisture'],
        'type': 'reference',
        'args': [],
        'kwargs': {},
    }
    ascat_config = {
        'class': ascat_reader,
        'columns': ['sm'],
        'type': 'other',
        'args': [],
        'kwargs': {},
        'grids_compatible': False,
        'use_lut': False,
        'lut_max_dist': 30000,
    }
    datasets = {'ISMN': ismn_config, 'ASCAT': ascat_config}

    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    process = Validation(
        datasets=datasets,
        data_prep=DataPreparation(),
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0,
            reverse=True),
        scaling='lin_cdf_match',
        scale_to_other=True,
        metrics_calculator=metrics_calculators.BasicMetrics(),
        period=period,
        cell_based_jobs=False)

    for job in jobs:
        results = process.calc(job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(
        save_path, 'ISMN.soil moisture_with_ASCAT.sm.nc')

    vars_should = [u'n_obs', u'tau', u'gpi', u'RMSD', u'lon', u'p_tau',
                   u'BIAS', u'p_rho', u'rho', u'lat', u'R', u'p_R']
    n_obs_should = [360, 385, 1644, 1881, 1927, 479, 140, 251]
    rho_should = np.array(
        [0.54618734, 0.71739876, 0.62089276, 0.53246528,
         0.30299741, 0.69647062, 0.840593, 0.73913699],
        dtype=np.float32)
    rmsd_should = np.array(
        [11.53626347, 7.54565048, 17.45193481, 21.19371414,
         14.24668026, 14.27493, 13.173215, 12.59192371],
        dtype=np.float32)

    # values are compared after sorting, i.e. independent of station order
    with nc.Dataset(results_fname) as ds:
        stored = ds.variables
        assert sorted(stored.keys()) == sorted(vars_should)
        assert sorted(stored['n_obs'][:].tolist()) == sorted(n_obs_should)
        nptest.assert_allclose(sorted(rho_should), sorted(stored['rho'][:]))
        nptest.assert_allclose(sorted(rmsd_should), sorted(stored['RMSD'][:]))
def test_ascat_ismn_validation():
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    here = os.path.dirname(__file__)
    ascat_root = os.path.join(
        here, '..', 'test-data', 'sat', 'ascat', 'netcdf')
    ascat_data_folder = os.path.join(ascat_root, '55R22')
    ascat_grid_folder = os.path.join(ascat_root, 'grid')

    ascat_reader = AscatH25_SSM(ascat_data_folder, ascat_grid_folder)
    ascat_reader.read_bulk = True
    ascat_reader._load_grid_info()

    # Initialize ISMN reader
    ismn_data_folder = os.path.join(
        os.path.dirname(__file__),
        '..', 'test-data', 'ismn', 'multinetwork', 'header_values')
    ismn_reader = ISMN_Interface(ismn_data_folder)

    ids = ismn_reader.get_dataset_ids(
        variable='soil moisture', min_depth=0, max_depth=0.1)

    # one job per ISMN dataset id: (id, lon, lat)
    jobs = []
    for idx in ids:
        station_meta = ismn_reader.metadata[idx]
        job = (idx, station_meta['longitude'], station_meta['latitude'])
        jobs.append(job)

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!
    save_path = tempfile.mkdtemp()

    # Create the validation object.
    ascat_kwargs = {
        'mask_frozen_prob': 80,
        'mask_snow_prob': 80,
        'mask_ssf': True,
    }
    datasets = {
        'ISMN': {
            'class': ismn_reader,
            'columns': ['soil moisture'],
        },
        'ASCAT': {
            'class': ascat_reader,
            'columns': ['sm'],
            'kwargs': ascat_kwargs,
        },
    }

    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    basic_metrics = metrics_calculators.BasicMetrics(other_name='k1')
    process = Validation(
        datasets,
        'ISMN',
        temporal_ref='ASCAT',
        scaling='lin_cdf_match',
        scaling_ref='ASCAT',
        metrics_calculators={(2, 2): basic_metrics.calc_metrics},
        period=period)

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(
        save_path, 'ASCAT.sm_with_ISMN.soil moisture.nc')

    vars_should = [u'n_obs', u'tau', u'gpi', u'RMSD', u'lon', u'p_tau',
                   u'BIAS', u'p_rho', u'rho', u'lat', u'R', u'p_R']
    n_obs_should = [360, 385, 1644, 1881, 1927, 479, 140, 251]
    rho_should = np.array(
        [0.546187, 0.717398, 0.620892, 0.532465,
         0.302997, 0.694713, 0.840592, 0.742065],
        dtype=np.float32)
    rmsd_should = np.array(
        [11.536263, 7.545650, 17.451935, 21.193714,
         14.246680, 14.494674, 13.173215, 12.903898],
        dtype=np.float32)

    # values are compared after sorting, i.e. independent of station order
    with nc.Dataset(results_fname) as ds:
        stored = ds.variables
        assert sorted(stored.keys()) == sorted(vars_should)
        assert sorted(stored['n_obs'][:].tolist()) == sorted(n_obs_should)
        nptest.assert_allclose(
            sorted(rho_should), sorted(stored['rho'][:]), rtol=1e-4)
        nptest.assert_allclose(
            sorted(rmsd_should), sorted(stored['RMSD'][:]), rtol=1e-4)
def test_validation_n3_k2_masking():
    """Validation with two masking datasets keeps 250 observations per gpi.

    The masked reference frame for gpi 1 must hold the values 750..999, and
    every job must report ``n_obs == 250`` for each gpi and dataset
    combination.
    """
    combinations = [
        (('DS1', 'x'), ('DS3', 'y')),
        (('DS1', 'x'), ('DS2', 'y')),
        (('DS1', 'x'), ('DS3', 'x')),
    ]

    def _expected(n_gpis):
        # one n_obs entry of 250 per gpi in the job
        return {
            combo: {'n_obs': np.full(n_gpis, 250, dtype=np.int32)}
            for combo in combinations
        }

    # cell 4 in this example has two gpis so it returns different results.
    # Keyed by the number of gpis in a job (the literal used to list key 1
    # twice).
    tst_results = {1: _expected(1), 2: _expected(2)}

    datasets = setup_TestDatasets()

    # setup masking datasets
    grid = grids.CellGrid(
        np.array([1, 2, 3, 4]),
        np.array([1, 2, 3, 4]),
        np.array([4, 4, 2, 1]),
        gpis=np.array([1, 2, 3, 4]),
    )

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        'masking1': {
            'class': mds1,
            'columns': ['x'],
            'args': [],
            'kwargs': {'limit': 500},
            'use_lut': False,
            'grids_compatible': True,
        },
        'masking2': {
            'class': mds2,
            'columns': ['x'],
            'args': [],
            'kwargs': {'limit': 750},
            'use_lut': False,
            'grids_compatible': True,
        },
    }

    process = Validation(
        datasets,
        'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0
        ).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name='k1'
            ).calc_metrics
        },
        masking_datasets=mds,
    )

    gpi_info = (1, 1, 1)
    ref_df = datasets['DS1']['class'].read_ts(1)
    new_ref_df = process.mask_dataset(ref_df, gpi_info)
    assert len(new_ref_df) == 250
    nptest.assert_allclose(new_ref_df.x.values, np.arange(750, 1000))

    jobs = process.get_processing_jobs()
    for job in jobs:
        results = process.calc(*job)

        tst = tst_results[len(job[0])]
        assert sorted(results) == sorted(tst)
        # the keys are identical at this point, so compare entry by entry
        for key in sorted(results):
            nptest.assert_almost_equal(
                results[key]['n_obs'], tst[key]['n_obs']
            )
def test_validation_n3_k2_masking():
    """
    Test the validation of three datasets (pairwise metrics) when two
    masking datasets are supplied.

    First ``Validation.mask_dataset`` is checked directly: for gpi 1 the
    masked reference frame must contain 250 rows whose ``x`` values are
    750..999. Afterwards all processing jobs are run and ``n_obs`` of every
    expected dataset combination is compared against 250 per gpi.
    """
    # dataset/column combinations for which results are expected
    combinations = [
        (("DS1", "x"), ("DS3", "y")),
        (("DS1", "x"), ("DS2", "y")),
        (("DS1", "x"), ("DS3", "x")),
        (("DS2", "y"), ("DS3", "x")),
        (("DS2", "y"), ("DS3", "y")),
    ]

    def expected(n_gpis):
        # one n_obs entry per gpi processed in the job
        return {
            combo: {"n_obs": np.array([250] * n_gpis, dtype=np.int32)}
            for combo in combinations
        }

    # Expected results keyed by the number of gpis in a job. Cell 4 in this
    # example has two gpis so it returns different results. (Each key is
    # listed exactly once; a repeated key in a dict literal is silently
    # overwritten.)
    tst_results = {1: expected(1), 2: expected(2)}

    datasets = setup_TestDatasets()

    # setup masking datasets
    grid = grids.CellGrid(
        np.array([1, 2, 3, 4]),
        np.array([1, 2, 3, 4]),
        np.array([4, 4, 2, 1]),
        gpis=np.array([1, 2, 3, 4]),
    )

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        "masking1": {
            "class": mds1,
            "columns": ["x"],
            "args": [],
            "kwargs": {"limit": 500},
            "use_lut": False,
            "grids_compatible": True,
        },
        "masking2": {
            "class": mds2,
            "columns": ["x"],
            "args": [],
            "kwargs": {"limit": 750},
            "use_lut": False,
            "grids_compatible": True,
        },
    }

    process = Validation(
        datasets,
        "DS1",
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0
        ).combinatory_matcher,
        scaling="lin_cdf_match",
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(
                other_name="k1"
            ).calc_metrics
        },
        masking_datasets=mds,
    )

    # check the masking of the reference on its own first
    gpi_info = (1, 1, 1)
    ref_df = datasets["DS1"]["class"].read(1)
    with warnings.catch_warnings():
        # read_ts is hard coded when using mask_data
        warnings.simplefilter("ignore", category=DeprecationWarning)
        new_ref_df = process.mask_dataset(ref_df, gpi_info)
    assert len(new_ref_df) == 250
    nptest.assert_allclose(new_ref_df.x.values, np.arange(750, 1000))

    for job in process.get_processing_jobs():
        with warnings.catch_warnings():
            # most warnings here are caused by the read_ts function that
            # cannot be changed when using a masking data set
            warnings.simplefilter("ignore", category=DeprecationWarning)
            results = process.calc(*job)

        # job[0] holds the gpis of the job; pick the matching expectation
        tst = tst_results[len(job[0])]
        assert sorted(results) == sorted(tst)
        # key sets are identical at this point, so index both by the same key
        for key in tst:
            nptest.assert_almost_equal(
                results[key]["n_obs"], tst[key]["n_obs"]
            )
def test_ascat_ismn_validation(ascat_reader, ismn_reader):
    """
    Test processing framework with some ISMN and ASCAT sample data

    ISMN is the spatial reference, ASCAT the temporal and scaling
    reference. The results written by ``netcdf_results_manager`` are
    compared against fixed target values via ``check_results``.
    """

    def _job_for(sensor_id):
        # a job is the tuple (id, longitude, latitude) of one ISMN sensor
        meta = ismn_reader.metadata[sensor_id]
        return (sensor_id, meta["longitude"], meta["latitude"])

    sensor_ids = ismn_reader.get_dataset_ids(
        variable="soil moisture", min_depth=0, max_depth=0.1
    )
    jobs = [_job_for(sensor_id) for sensor_id in sensor_ids]

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!
    save_path = tempfile.mkdtemp()

    # Create the validation object.
    ismn_config = {"class": ismn_reader, "columns": ["soil moisture"]}
    ascat_config = {
        "class": ascat_reader,
        "columns": ["sm"],
        "kwargs": {
            "mask_frozen_prob": 80,
            "mask_snow_prob": 80,
            "mask_ssf": True,
        },
    }
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(
        {"ISMN": ismn_config, "ASCAT": ascat_config},
        "ISMN",
        period,
        read_ts_names={"ASCAT": "read", "ISMN": "read_ts"},
    )

    basic_metrics = metrics_calculators.BasicMetrics(other_name="k1")
    process = Validation(
        datasets,
        "ISMN",
        temporal_ref="ASCAT",
        scaling="lin_cdf_match",
        scaling_ref="ASCAT",
        metrics_calculators={(2, 2): basic_metrics.calc_metrics},
        period=period,
    )

    # run every job and store its results in the output directory
    for job in jobs:
        netcdf_results_manager(process.calc(*job), save_path)

    # targets
    expected_rho = [
        0.53934574, 0.7002289, 0.62200236, 0.53647155,
        0.30413666, 0.6740655, 0.8418981, 0.74206454,
    ]
    expected_rmsd = [
        11.583476, 7.729667, 17.441547, 21.125721,
        14.31557, 14.187225, 13.0622425, 12.903898,
    ]
    target_vars = {
        "n_obs": [357, 384, 1646, 1875, 1915, 467, 141, 251],
        "rho": np.array(expected_rho, dtype=np.float32),
        "RMSD": np.array(expected_rmsd, dtype=np.float32),
    }

    check_results(
        filename=os.path.join(
            save_path, "ASCAT.sm_with_ISMN.soil moisture.nc"
        ),
        target_vars=target_vars,
    )
def test_ascat_ismn_validation():
    """
    Test processing framework with some ISMN and ASCAT sample data

    ISMN is the spatial reference, ASCAT the temporal and scaling
    reference. The netCDF file written by ``netcdf_results_manager`` is
    checked for its variable names and for the sorted ``n_obs``, ``rho``
    and ``RMSD`` values.
    """
    test_data = os.path.join(os.path.dirname(__file__), '..', 'test-data')

    # Initialize ASCAT reader
    ascat_root = os.path.join(test_data, 'sat', 'ascat', 'netcdf')
    ascat_data_folder = os.path.join(ascat_root, '55R22')
    ascat_grid_folder = os.path.join(ascat_root, 'grid')
    ascat_reader = AscatH25_SSM(ascat_data_folder, ascat_grid_folder)
    ascat_reader.read_bulk = True
    ascat_reader._load_grid_info()

    # Initialize ISMN reader
    ismn_data_folder = os.path.join(test_data, 'ismn', 'multinetwork',
                                    'header_values')
    ismn_reader = ISMN_Interface(ismn_data_folder)

    # one job (id, longitude, latitude) per selected ISMN sensor
    jobs = []
    ids = ismn_reader.get_dataset_ids(variable='soil moisture',
                                      min_depth=0,
                                      max_depth=0.1)
    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        jobs.append((idx, metadata['longitude'], metadata['latitude']))

    # Create the validation object.
    datasets = {
        'ISMN': {
            'class': ismn_reader,
            'columns': ['soil moisture']
        },
        'ASCAT': {
            'class': ascat_reader,
            'columns': ['sm'],
            'kwargs': {
                'mask_frozen_prob': 80,
                'mask_snow_prob': 80,
                'mask_ssf': True
            }
        }
    }

    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    process = Validation(
        datasets,
        'ISMN',
        temporal_ref='ASCAT',
        scaling='lin_cdf_match',
        scaling_ref='ASCAT',
        metrics_calculators={
            (2, 2): metrics_calculators.BasicMetrics(
                other_name='k1').calc_metrics
        },
        period=period)

    vars_should = [
        'n_obs', 'tau', 'gpi', 'RMSD', 'lon', 'p_tau', 'BIAS', 'p_rho',
        'rho', 'lat', 'R', 'p_R'
    ]
    n_obs_should = [360, 385, 1644, 1881, 1927, 479, 140, 251]
    rho_should = np.array([
        0.546187, 0.717398, 0.620892, 0.532465, 0.302997, 0.694713,
        0.840592, 0.742065
    ], dtype=np.float32)
    rmsd_should = np.array([
        11.536263, 7.545650, 17.451935, 21.193714, 14.246680, 14.494674,
        13.173215, 12.903898
    ], dtype=np.float32)

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!
    # The context manager removes the directory again once the checks are
    # done, so repeated test runs do not pile up temporary directories.
    with tempfile.TemporaryDirectory() as save_path:
        for job in jobs:
            results = process.calc(*job)
            netcdf_results_manager(results, save_path)

        results_fname = os.path.join(save_path,
                                     'ASCAT.sm_with_ISMN.soil moisture.nc')

        with nc.Dataset(results_fname) as results:
            assert sorted(results.variables.keys()) == sorted(vars_should)
            assert sorted(
                results.variables['n_obs'][:].tolist()) == sorted(n_obs_should)
            # assert_allclose takes (actual, desired); the tolerance is
            # relative to the desired values
            nptest.assert_allclose(sorted(results.variables['rho'][:]),
                                   sorted(rho_should),
                                   rtol=1e-4)
            nptest.assert_allclose(sorted(results.variables['RMSD'][:]),
                                   sorted(rmsd_should),
                                   rtol=1e-4)
def test_ascat_ismn_validation_metadata(ascat_reader, ismn_reader):
    """
    Test processing framework with some ISMN and ASCAT sample data

    Each job additionally carries a metadata dictionary (network, station,
    landcover, climate) which is handed to the metrics calculator through
    ``metadata_template``.
    """
    # template describing the metadata fields handed to the calculator
    metadata_dict_template = {
        "network": np.array(["None"], dtype="U256"),
        "station": np.array(["None"], dtype="U256"),
        "landcover": np.float32([np.nan]),
        "climate": np.array(["None"], dtype="U4"),
    }

    def _job_for(sensor_id):
        # job tuple: (id, longitude, latitude, [metadata dict])
        meta = ismn_reader.metadata[sensor_id]
        job_metadata = [{
            "network": meta["network"],
            "station": meta["station"],
            "landcover": meta["landcover_2010"],
            "climate": meta["climate"],
        }]
        return (sensor_id, meta["longitude"], meta["latitude"], job_metadata)

    sensor_ids = ismn_reader.get_dataset_ids(
        variable="soil moisture", min_depth=0, max_depth=0.1
    )
    jobs = [_job_for(sensor_id) for sensor_id in sensor_ids]

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!
    save_path = tempfile.mkdtemp()

    # Create the validation object.
    ismn_config = {"class": ismn_reader, "columns": ["soil moisture"]}
    ascat_config = {
        "class": ascat_reader,
        "columns": ["sm"],
        "kwargs": {
            "mask_frozen_prob": 80,
            "mask_snow_prob": 80,
            "mask_ssf": True,
        },
    }
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(
        {"ISMN": ismn_config, "ASCAT": ascat_config},
        "ISMN",
        period,
        read_ts_names={"ASCAT": "read", "ISMN": "read_ts"},
    )

    basic_metrics = metrics_calculators.BasicMetrics(
        other_name="k1", metadata_template=metadata_dict_template
    )
    process = Validation(
        datasets,
        "ISMN",
        temporal_ref="ASCAT",
        scaling="lin_cdf_match",
        scaling_ref="ASCAT",
        metrics_calculators={(2, 2): basic_metrics.calc_metrics},
        period=period,
    )

    # run every job and store its results in the output directory
    for job in jobs:
        netcdf_results_manager(process.calc(*job), save_path)

    results_fname = os.path.join(
        save_path, "ASCAT.sm_with_ISMN.soil moisture.nc"
    )

    # targets
    expected_rho = [
        0.53934574, 0.7002289, 0.62200236, 0.53647155,
        0.30413666, 0.6740655, 0.8418981, 0.74206454,
    ]
    expected_rmsd = [
        11.583476, 7.729667, 17.441547, 21.125721,
        14.31557, 14.187225, 13.0622425, 12.903898,
    ]
    expected_networks = [
        "MAQU", "MAQU",
        "SCAN", "SCAN", "SCAN",
        "SOILSCAPE", "SOILSCAPE", "SOILSCAPE",
    ]
    target_vars = {
        "n_obs": [357, 384, 1646, 1875, 1915, 467, 141, 251],
        "rho": np.array(expected_rho, dtype=np.float32),
        "RMSD": np.array(expected_rmsd, dtype=np.float32),
        "network": np.array(expected_networks, dtype="U256"),
    }

    # names of all variables expected in the results file
    vars_should = [
        "BIAS", "R", "RMSD", "_row_size", "climate", "gpi", "idx",
        "landcover", "lat", "lon", "n_obs", "network", "p_R", "p_rho",
        "p_tau", "rho", "station", "tau", "time",
    ]

    check_results(
        filename=results_fname,
        target_vars=target_vars,
        variables=vars_should,
    )
def test_validation_with_averager(ascat_reader, ismn_reader):
    """
    Test processing framework with averaging module. ASCAT and ISMN data are
    used here with no geographical considerations (the lut is provided more
    upstream and contains this information already)
    """
    # strip wrapper objects until the object without a `cls` attribute
    # is reached
    while hasattr(ascat_reader, 'cls'):
        ascat_reader = ascat_reader.cls

    # lookup table between the ascat and ismn points - not geographically
    # correct
    maqu_points = [(0, 102.1333, 33.8833), (1, 102.1333, 33.6666)]
    scan_points = [
        (2, -86.55, 34.783),
        (3, -97.083, 37.133),
        (4, -105.417, 34.25),
    ]
    soilscape_points = [
        (5, -120.9675, 38.43003),
        (6, -120.78559, 38.14956),
        (7, -120.80639, 38.17353),
    ]
    upscaling_lut = {
        "ISMN": {
            1814367: maqu_points,
            1803695: scan_points,
            1856312: soilscape_points,
        }
    }

    # a single job containing all three ASCAT grid points
    gpis = (1814367, 1803695, 1856312)
    coordinates = [ascat_reader.grid.gpi2lonlat(gpi) for gpi in gpis]
    lons = [lon for lon, _ in coordinates]
    lats = [lat for _, lat in coordinates]
    jobs = [(gpis, lons, lats)]

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!
    save_path = tempfile.mkdtemp()

    # Create the validation object.
    ascat_config = {
        "class": ascat_reader,
        "columns": ["sm"],
        "kwargs": {
            "mask_frozen_prob": 80,
            "mask_snow_prob": 80,
            "mask_ssf": True,
        },
    }
    ismn_config = {"class": ismn_reader, "columns": ["soil moisture"]}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    upscale_parms = {
        "upscaling_method": "average",
        "temporal_stability": True,
        "upscaling_lut": upscaling_lut,
    }
    datasets = DataManager(
        {"ASCAT": ascat_config, "ISMN": ismn_config},
        "ASCAT",
        period,
        read_ts_names={"ASCAT": "read", "ISMN": "read_ts"},
        upscale_parms=upscale_parms,
    )

    basic_metrics = metrics_calculators.BasicMetrics(other_name="k1")
    process = Validation(
        datasets,
        "ASCAT",
        temporal_ref="ISMN",
        scaling="lin_cdf_match",
        scaling_ref="ISMN",
        metrics_calculators={(2, 2): basic_metrics.calc_metrics},
        period=period,
    )

    # run every job and store its results in the output directory
    for job in jobs:
        netcdf_results_manager(process.calc(*job), save_path)

    # targets
    target_vars = {
        "n_obs": [764, 2392, 904],
        "rho": np.array([-0.012487, 0.255156, 0.635517], dtype=np.float32),
        "RMSD": np.array([0.056428, 0.056508, 0.116294], dtype=np.float32),
        "R": np.array([-0.012335, 0.257671, 0.657239], dtype=np.float32),
    }

    check_results(
        filename=os.path.join(
            save_path, "ASCAT.sm_with_ISMN.soil moisture.nc"
        ),
        target_vars=target_vars,
    )
def test_ascat_ismn_validation_metadata_rolling():
    """
    Test processing framework with some ISMN and ASCAT sample data

    Uses ``RollingMetrics`` as the metrics calculator and stores 'R', 'p_R'
    and 'RMSD' as time series variables; the stored file is read back with
    ``PointDataResults``.
    """
    test_data = os.path.join(os.path.dirname(__file__), '..', 'test-data')
    sat_data = os.path.join(test_data, 'sat')

    # Initialize ASCAT reader
    ascat_reader = AscatSsmCdr(
        os.path.join(sat_data, 'ascat', 'netcdf', '55R22'),
        os.path.join(sat_data, 'ascat', 'netcdf', 'grid'),
        grid_filename='TUW_WARP5_grid_info_2_1.nc',
        static_layer_path=os.path.join(sat_data, 'h_saf', 'static_layer'),
    )
    ascat_reader.read_bulk = True

    # Initialize ISMN reader
    ismn_reader = ISMN_Interface(
        os.path.join(test_data, 'ismn', 'multinetwork', 'header_values')
    )

    # template describing the metadata fields handed to the calculator
    metadata_dict_template = {
        'network': np.array(['None'], dtype='U256'),
        'station': np.array(['None'], dtype='U256'),
        'landcover': np.float32([np.nan]),
        'climate': np.array(['None'], dtype='U4'),
    }

    def _job_for(sensor_id):
        # job tuple: (id, longitude, latitude, [metadata dict])
        meta = ismn_reader.metadata[sensor_id]
        job_metadata = [{
            'network': meta['network'],
            'station': meta['station'],
            'landcover': meta['landcover_2010'],
            'climate': meta['climate'],
        }]
        return (sensor_id, meta['longitude'], meta['latitude'], job_metadata)

    sensor_ids = ismn_reader.get_dataset_ids(
        variable='soil moisture', min_depth=0, max_depth=0.1
    )
    jobs = [_job_for(sensor_id) for sensor_id in sensor_ids]

    save_path = tempfile.mkdtemp()

    # Create the validation object.
    ismn_config = {'class': ismn_reader, 'columns': ['soil moisture']}
    ascat_config = {
        'class': ascat_reader,
        'columns': ['sm'],
        'kwargs': {
            'mask_frozen_prob': 80,
            'mask_snow_prob': 80,
            'mask_ssf': True,
        },
    }
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(
        {'ISMN': ismn_config, 'ASCAT': ascat_config},
        'ISMN',
        period,
        read_ts_names={'ASCAT': 'read', 'ISMN': 'read_ts'},
    )

    rolling_metrics = metrics_calculators.RollingMetrics(
        other_name='k1', metadata_template=metadata_dict_template
    )
    process = Validation(
        datasets,
        'ISMN',
        temporal_ref='ASCAT',
        scaling='lin_cdf_match',
        scaling_ref='ASCAT',
        metrics_calculators={(2, 2): rolling_metrics.calc_metrics},
        period=period,
    )

    # run every job and store its results, the listed metrics as time series
    for job in jobs:
        netcdf_results_manager(
            process.calc(*job), save_path, ts_vars=['R', 'p_R', 'RMSD']
        )

    results_fname = os.path.join(save_path,
                                 'ASCAT.sm_with_ISMN.soil moisture.nc')

    # NOTE(review): this list of variable names is assembled but never
    # compared against the file contents below - confirm whether an
    # assertion is missing.
    vars_should = [
        'gpi', 'lon', 'lat', 'R', 'p_R', 'time', 'idx', '_row_size'
    ]
    vars_should.extend(metadata_dict_template)

    network_should = np.array(
        [
            'MAQU', 'MAQU',
            'SCAN', 'SCAN', 'SCAN',
            'SOILSCAPE', 'SOILSCAPE', 'SOILSCAPE',
        ],
        dtype='U256',
    )

    reader = PointDataResults(results_fname, read_only=True)
    df = reader.read_loc(None)
    nptest.assert_equal(sorted(network_should), sorted(df['network'].values))
    assert np.all(df.gpi.values == np.arange(8))
    assert reader.read_ts(0).index.size == 357
    stored_columns = reader.read_ts(1).columns.values
    assert np.all(stored_columns == np.array(['R', 'p_R', 'RMSD']))
def test_ascat_ismn_validation():
    """
    Test processing framework with some ISMN and ASCAT sample data

    ISMN is the spatial reference, ASCAT the temporal and scaling
    reference. The netCDF file written by ``netcdf_results_manager`` is
    checked for its variable names and for the sorted ``n_obs``, ``rho``
    and ``RMSD`` values.
    """
    test_data = os.path.join(os.path.dirname(__file__), '..', 'test-data')

    # Initialize ASCAT reader
    ascat_data_folder = os.path.join(test_data, 'sat', 'ascat', 'netcdf',
                                     '55R22')
    ascat_grid_folder = os.path.join(test_data, 'sat', 'ascat', 'netcdf',
                                     'grid')
    static_layers_folder = os.path.join(test_data, 'sat', 'h_saf',
                                        'static_layer')
    ascat_reader = AscatSsmCdr(ascat_data_folder, ascat_grid_folder,
                               static_layer_path=static_layers_folder)
    ascat_reader.read_bulk = True

    # Initialize ISMN reader
    ismn_data_folder = os.path.join(test_data, 'ismn', 'multinetwork',
                                    'header_values')
    ismn_reader = ISMN_Interface(ismn_data_folder)

    # one job (id, longitude, latitude) per selected ISMN sensor
    jobs = []
    ids = ismn_reader.get_dataset_ids(
        variable='soil moisture',
        min_depth=0,
        max_depth=0.1)
    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        jobs.append((idx, metadata['longitude'], metadata['latitude']))

    # Create the validation object.
    datasets = {
        'ISMN': {
            'class': ismn_reader,
            'columns': ['soil moisture']
        },
        'ASCAT': {
            'class': ascat_reader,
            'columns': ['sm'],
            'kwargs': {'mask_frozen_prob': 80,
                       'mask_snow_prob': 80,
                       'mask_ssf': True}
        }}

    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    process = Validation(
        datasets, 'ISMN',
        temporal_ref='ASCAT',
        scaling='lin_cdf_match',
        scaling_ref='ASCAT',
        metrics_calculators={
            (2, 2): metrics_calculators.BasicMetrics(
                other_name='k1').calc_metrics},
        period=period)

    vars_should = ['n_obs', 'tau', 'gpi', 'RMSD', 'lon', 'p_tau',
                   'BIAS', 'p_rho', 'rho', 'lat', 'R', 'p_R']
    n_obs_should = [384, 357, 482, 141, 251, 1927, 1887, 1652]
    rho_should = np.array([0.70022893, 0.53934574,
                           0.69356072, 0.84189808,
                           0.74206454, 0.30299741,
                           0.53143877, 0.62204134], dtype=np.float32)
    rmsd_should = np.array([7.72966719, 11.58347607,
                            14.57700157, 13.06224251,
                            12.90389824, 14.24668026,
                            21.19682884, 17.3883934], dtype=np.float32)

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!
    # The context manager removes the directory again once the checks are
    # done, so repeated test runs do not pile up temporary directories.
    with tempfile.TemporaryDirectory() as save_path:
        for job in jobs:
            results = process.calc(*job)
            netcdf_results_manager(results, save_path)

        results_fname = os.path.join(
            save_path, 'ASCAT.sm_with_ISMN.soil moisture.nc')

        with nc.Dataset(results_fname) as results:
            assert sorted(results.variables.keys()) == sorted(vars_should)
            assert sorted(results.variables['n_obs'][:].tolist()) == sorted(
                n_obs_should)
            # assert_allclose takes (actual, desired); the tolerance is
            # relative to the desired values
            nptest.assert_allclose(sorted(results.variables['rho'][:]),
                                   sorted(rho_should),
                                   rtol=1e-4)
            nptest.assert_allclose(sorted(results.variables['RMSD'][:]),
                                   sorted(rmsd_should),
                                   rtol=1e-4)
def test_temporal_matching_ascat_ismn():
    """
    This test uses a CSV file of ASCAT and ISMN data to test if the temporal
    matching within the validation works as expected in a "real" setup.
    This only tests whether the number of observations matches, because this
    is the main thing the temporal matching influences.

    The same data is validated twice: once with the matcher returned by
    ``make_combined_temporal_matcher`` together with
    ``PairwiseIntercomparisonMetrics`` ("new" setup), and once with
    ``BasicTemporalMatching().combinatory_matcher`` together with
    ``IntercomparisonMetrics`` ("old" setup).
    """
    # test with ASCAT and ISMN data
    here = Path(__file__).resolve().parent
    ascat = pd.read_csv(here / "ASCAT.csv", index_col=0, parse_dates=True)
    ismn = pd.read_csv(here / "ISMN.csv", index_col=0, parse_dates=True)
    dfs = {"ASCAT": ascat, "ISMN": ismn}
    columns = {"ASCAT": "sm", "ISMN": "soil_moisture"}
    refname = "ISMN"
    # lowercase "h": the uppercase "H" unit alias is deprecated in pandas
    window = pd.Timedelta(12, "h")

    old_matcher = BasicTemporalMatching().combinatory_matcher
    new_matcher = make_combined_temporal_matcher(window)

    # the reference is inserted first, so it is also first in ds_names below
    datasets = {}
    for key in ["ISMN", "ASCAT"]:
        all_columns = list(dfs[key].columns)
        datasets[key] = {
            "columns": [columns[key]],
            "class": DummyReader(dfs[key], all_columns),
        }

    # new setup
    new_val = Validation(
        datasets,
        refname,
        scaling=None,  # doesn't work with the constant test data
        temporal_matcher=new_matcher,
        metrics_calculators={
            (2, 2): PairwiseIntercomparisonMetrics().calc_metrics
        },
    )
    new_results = new_val.calc(
        1, 1, 1, rename_cols=False, only_with_temporal_ref=True
    )

    # old setup
    ds_names = list(datasets.keys())
    metrics = IntercomparisonMetrics(
        dataset_names=ds_names,
        # passing the names here explicitly, see GH issue #220
        refname=refname,
        other_names=ds_names[1:],
        calc_tau=True,
    )
    old_val = Validation(
        datasets,
        refname,
        scaling=None,  # doesn't work with the constant test data
        temporal_matcher=old_matcher,
        metrics_calculators={(2, 2): metrics.calc_metrics},
    )
    old_results = old_val.calc(1, 1, 1, rename_cols=False)

    # both setups report their results under the same key
    key = (("ASCAT", "sm"), ("ISMN", "soil_moisture"))
    assert old_results[key]["n_obs"] == new_results[key]["n_obs"]
def test_PairwiseIntercomparisonMetrics_confidence_intervals():
    """
    Test if the correct confidence intervals are returned.

    The confidence bounds stored in the validation results are compared
    with bounds calculated manually from the test data using
    ``with_analytical_ci`` and ``with_bootstrapped_ci``.
    """
    datasets, _ = testdata_random()
    # lowercase "h": the uppercase "H" unit alias is deprecated in pandas
    matcher = make_combined_temporal_matcher(pd.Timedelta(6, "h"))
    val = Validation(
        datasets,
        "reference_name",
        scaling=None,  # doesn't work with the constant test data
        temporal_matcher=matcher,
        metrics_calculators={
            (4, 2): PairwiseIntercomparisonMetrics(
                calc_spearman=True,
                calc_kendall=True,
                analytical_cis=True,
                bootstrap_cis=True,
            ).calc_metrics
        },
    )
    results_pw = val.calc([1], [1], [1], rename_cols=False)

    # result name -> name of the function in `pairwise`; these are compared
    # against analytical confidence intervals
    metrics_with_ci = {
        "BIAS": "bias",
        "R": "pearson_r",
        "rho": "spearman_r",
        "tau": "kendall_tau",
        "RMSD": "rmsd",
        "urmsd": "ubrmsd",
        "mse": "msd",
        "mse_bias": "mse_bias",
    }
    # these are compared against bootstrapped confidence intervals
    metrics_with_bs_ci = {
        "mse_corr": "mse_corr",
        "mse_var": "mse_var",
    }

    # reconstruct dataframe
    frames = [datasets[key]["class"].data for key in datasets]
    data = pd.concat(frames, axis=1).dropna(how="any")

    for key in results_pw:
        # the data column name is the part of the dataset name before the
        # first underscore
        othername = key[0][0]
        other_col = othername.split("_")[0]
        other = data[other_col].values
        refname = key[1][0]
        ref_col = refname.split("_")[0]
        ref = data[ref_col].values

        for metric_key, func_name in metrics_with_ci.items():
            lower = results_pw[key][f"{metric_key}_ci_lower"]
            upper = results_pw[key][f"{metric_key}_ci_upper"]

            # calculate manually from data
            metric_func = getattr(pairwise, func_name)
            _, lb, ub = with_analytical_ci(metric_func, other, ref)
            # difference due to float32 vs. float64
            assert_almost_equal(upper, ub, 6)
            assert_almost_equal(lower, lb, 6)

        for metric_key, func_name in metrics_with_bs_ci.items():
            lower = results_pw[key][f"{metric_key}_ci_lower"]
            upper = results_pw[key][f"{metric_key}_ci_upper"]

            # calculate manually from data
            metric_func = getattr(pairwise, func_name)
            _, lb, ub = with_bootstrapped_ci(metric_func, other, ref)
            assert_allclose(upper, ub, rtol=1e-1, atol=1e-4)
            assert_allclose(lower, lb, rtol=1e-1, atol=1e-4)
def test_ascat_ismn_validation_metadata(ascat_reader):
    """
    Test processing framework with some ISMN and ASCAT sample data

    Each job additionally carries a metadata dictionary (network, station,
    landcover, climate) which is handed to the metrics calculator through
    ``metadata_template``. The netCDF file written by
    ``netcdf_results_manager`` is checked for its variable names and for
    the sorted ``n_obs``, ``rho``, ``RMSD`` and ``network`` values.
    """
    # Initialize ISMN reader
    ismn_data_folder = os.path.join(
        os.path.dirname(__file__),
        "..",
        "test-data",
        "ismn",
        "multinetwork",
        "header_values",
    )
    ismn_reader = ISMN_Interface(ismn_data_folder)

    ids = ismn_reader.get_dataset_ids(
        variable="soil moisture", min_depth=0, max_depth=0.1
    )

    # template describing the metadata fields handed to the calculator
    metadata_dict_template = {
        "network": np.array(["None"], dtype="U256"),
        "station": np.array(["None"], dtype="U256"),
        "landcover": np.float32([np.nan]),
        "climate": np.array(["None"], dtype="U4"),
    }

    # one job (id, longitude, latitude, [metadata dict]) per selected sensor
    jobs = []
    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        metadata_dict = [
            {
                "network": metadata["network"],
                "station": metadata["station"],
                "landcover": metadata["landcover_2010"],
                "climate": metadata["climate"],
            }
        ]
        jobs.append(
            (idx, metadata["longitude"], metadata["latitude"], metadata_dict)
        )

    # Create the validation object.
    datasets = {
        "ISMN": {
            "class": ismn_reader,
            "columns": ["soil moisture"],
        },
        "ASCAT": {
            "class": ascat_reader,
            "columns": ["sm"],
            "kwargs": {
                "mask_frozen_prob": 80,
                "mask_snow_prob": 80,
                "mask_ssf": True,
            },
        },
    }

    read_ts_names = {"ASCAT": "read", "ISMN": "read_ts"}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(
        datasets, "ISMN", period, read_ts_names=read_ts_names
    )

    process = Validation(
        datasets,
        "ISMN",
        temporal_ref="ASCAT",
        scaling="lin_cdf_match",
        scaling_ref="ASCAT",
        metrics_calculators={
            (2, 2): metrics_calculators.BasicMetrics(
                other_name="k1", metadata_template=metadata_dict_template
            ).calc_metrics
        },
        period=period,
    )

    # expected variables: the fixed names plus one per metadata field
    vars_should = [
        "n_obs",
        "tau",
        "gpi",
        "RMSD",
        "lon",
        "p_tau",
        "BIAS",
        "p_rho",
        "rho",
        "lat",
        "R",
        "p_R",
        "time",
        "idx",
        "_row_size",
    ]
    vars_should.extend(metadata_dict_template)

    n_obs_should = [357, 384, 1646, 1875, 1915, 467, 141, 251]
    rho_should = np.array(
        [
            0.53934574,
            0.7002289,
            0.62200236,
            0.53647155,
            0.30413666,
            0.6740655,
            0.8418981,
            0.74206454,
        ],
        dtype=np.float32,
    )
    rmsd_should = np.array(
        [
            11.583476,
            7.729667,
            17.441547,
            21.125721,
            14.31557,
            14.187225,
            13.0622425,
            12.903898,
        ],
        dtype=np.float32,
    )
    network_should = np.array(
        [
            "MAQU",
            "MAQU",
            "SCAN",
            "SCAN",
            "SCAN",
            "SOILSCAPE",
            "SOILSCAPE",
            "SOILSCAPE",
        ],
        dtype="U256",
    )

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!
    # The context manager removes the directory again once the values have
    # been read, so repeated test runs do not pile up temporary directories.
    with tempfile.TemporaryDirectory() as save_path:
        for job in jobs:
            results = process.calc(*job)
            netcdf_results_manager(results, save_path)

        results_fname = os.path.join(
            save_path, "ASCAT.sm_with_ISMN.soil moisture.nc"
        )

        with nc.Dataset(results_fname, mode="r") as results:
            # `var_names` instead of `vars` to avoid shadowing the builtin
            var_names = results.variables.keys()
            n_obs = results.variables["n_obs"][:].tolist()
            rho = results.variables["rho"][:]
            rmsd = results.variables["RMSD"][:]
            network = results.variables["network"][:]

            assert sorted(var_names) == sorted(vars_should)

    assert sorted(n_obs) == sorted(n_obs_should)
    nptest.assert_allclose(sorted(rho), sorted(rho_should), rtol=1e-4)
    nptest.assert_allclose(sorted(rmsd), sorted(rmsd_should), rtol=1e-4)
    nptest.assert_equal(sorted(network), sorted(network_should))
# # ```python # { (3 ,2): metric_calc, # (3, 3): triple_collocation} # ``` # # Create the variable ***save_path*** which is a string representing the path where the results will be saved. # **DO NOT CHANGE** the name ***save_path*** because it will be searched during the parallel processing! # In[9]: save_path = output_folder import pprint for job in jobs: results = process.calc(*job) pprint.pprint(results) netcdf_results_manager(results, save_path) # The validation is then performed by looping over all the defined jobs and storing the results. # You can see that the results are a dictionary where the key is a tuple defining the exact combination of datasets # and columns that were used for the calculation of the metrics. The metrics itself are a dictionary of `metric-name: # numpy.ndarray` which also include information about the gpi, lon and lat. Since all the information contained in # the job is given to the metric calculator they can be stored in the results. # # Storing of the results to disk is at the moment supported by the `netcdf_results_manager` which creates a netCDF # file for each dataset combination and stores each metric as a variable. We can inspect the stored netCDF file which # is named after the dictionary key: # In[10]:
def test_ascat_ismn_validation():
    """
    Test processing framework with some ISMN and ASCAT sample data

    ISMN is the spatial reference, ASCAT the temporal and scaling
    reference. The netCDF file written by ``netcdf_results_manager`` is
    checked for its variable names and for the sorted ``n_obs``, ``rho``
    and ``RMSD`` values.
    """
    test_data = os.path.join(os.path.dirname(__file__), '..', 'test-data')

    # Initialize ASCAT reader
    ascat_data_folder = os.path.join(test_data, 'sat', 'ascat', 'netcdf',
                                     '55R22')
    ascat_grid_folder = os.path.join(test_data, 'sat', 'ascat', 'netcdf',
                                     'grid')
    static_layers_folder = os.path.join(test_data, 'sat', 'h_saf',
                                        'static_layer')
    ascat_reader = AscatSsmCdr(ascat_data_folder, ascat_grid_folder,
                               grid_filename='TUW_WARP5_grid_info_2_1.nc',
                               static_layer_path=static_layers_folder)
    ascat_reader.read_bulk = True

    # Initialize ISMN reader
    ismn_data_folder = os.path.join(test_data, 'ismn', 'multinetwork',
                                    'header_values')
    ismn_reader = ISMN_Interface(ismn_data_folder)

    # one job (id, longitude, latitude) per selected ISMN sensor
    jobs = []
    ids = ismn_reader.get_dataset_ids(
        variable='soil moisture',
        min_depth=0,
        max_depth=0.1)
    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        jobs.append((idx, metadata['longitude'], metadata['latitude']))

    # Create the validation object.
    datasets = {
        'ISMN': {
            'class': ismn_reader,
            'columns': ['soil moisture']
        },
        'ASCAT': {
            'class': ascat_reader,
            'columns': ['sm'],
            'kwargs': {'mask_frozen_prob': 80,
                       'mask_snow_prob': 80,
                       'mask_ssf': True}
        }}

    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    process = Validation(
        datasets, 'ISMN',
        temporal_ref='ASCAT',
        scaling='lin_cdf_match',
        scaling_ref='ASCAT',
        metrics_calculators={
            (2, 2): metrics_calculators.BasicMetrics(
                other_name='k1').calc_metrics},
        period=period)

    vars_should = ['n_obs', 'tau', 'gpi', 'RMSD', 'lon', 'p_tau',
                   'BIAS', 'p_rho', 'rho', 'lat', 'R', 'p_R']
    n_obs_should = [384, 357, 482, 141, 251, 1927, 1887, 1652]
    rho_should = np.array([0.70022893, 0.53934574,
                           0.69356072, 0.84189808,
                           0.74206454, 0.30299741,
                           0.53143877, 0.62204134], dtype=np.float32)
    rmsd_should = np.array([7.72966719, 11.58347607,
                            14.57700157, 13.06224251,
                            12.90389824, 14.24668026,
                            21.19682884, 17.3883934], dtype=np.float32)

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!
    # The context manager removes the directory again once the checks are
    # done, so repeated test runs do not pile up temporary directories.
    with tempfile.TemporaryDirectory() as save_path:
        for job in jobs:
            results = process.calc(*job)
            netcdf_results_manager(results, save_path)

        results_fname = os.path.join(
            save_path, 'ASCAT.sm_with_ISMN.soil moisture.nc')

        with nc.Dataset(results_fname, mode='r') as results:
            assert sorted(results.variables.keys()) == sorted(vars_should)
            assert sorted(results.variables['n_obs'][:].tolist()) == sorted(
                n_obs_should)
            # assert_allclose takes (actual, desired); the tolerance is
            # relative to the desired values
            nptest.assert_allclose(sorted(results.variables['rho'][:]),
                                   sorted(rho_should),
                                   rtol=1e-4)
            nptest.assert_allclose(sorted(results.variables['RMSD'][:]),
                                   sorted(rmsd_should),
                                   rtol=1e-4)