Example #1
def test_validation_n3_k2():

    tst_results = {
        (('DS1', 'x'), ('DS3', 'y')): {
            'n_obs': np.array([1000], dtype=np.int32),
            'tau': np.array([np.nan], dtype=np.float32),
            'gpi': np.array([4], dtype=np.int32),
            'RMSD': np.array([0.], dtype=np.float32),
            'lon': np.array([4.]),
            'p_tau': np.array([np.nan], dtype=np.float32),
            'BIAS': np.array([0.], dtype=np.float32),
            'p_rho': np.array([0.], dtype=np.float32),
            'rho': np.array([1.], dtype=np.float32),
            'lat': np.array([4.]),
            'R': np.array([1.], dtype=np.float32),
            'p_R': np.array([0.], dtype=np.float32)},
        (('DS1', 'x'), ('DS2', 'y')): {
            'n_obs': np.array([1000], dtype=np.int32),
            'tau': np.array([np.nan], dtype=np.float32),
            'gpi': np.array([4], dtype=np.int32),
            'RMSD': np.array([0.], dtype=np.float32),
            'lon': np.array([4.]),
            'p_tau': np.array([np.nan], dtype=np.float32),
            'BIAS': np.array([0.], dtype=np.float32),
            'p_rho': np.array([0.], dtype=np.float32),
            'rho': np.array([1.], dtype=np.float32),
            'lat': np.array([4.]),
            'R': np.array([1.], dtype=np.float32),
            'p_R': np.array([0.], dtype=np.float32)},
        (('DS1', 'x'), ('DS3', 'x')): {
            'n_obs': np.array([1000], dtype=np.int32),
            'tau': np.array([np.nan], dtype=np.float32),
            'gpi': np.array([4], dtype=np.int32),
            'RMSD': np.array([0.], dtype=np.float32),
            'lon': np.array([4.]),
            'p_tau': np.array([np.nan], dtype=np.float32),
            'BIAS': np.array([0.], dtype=np.float32),
            'p_rho': np.array([0.], dtype=np.float32),
            'rho': np.array([1.], dtype=np.float32),
            'lat': np.array([4.]),
            'R': np.array([1.], dtype=np.float32),
            'p_R': np.array([0.], dtype=np.float32)}}

    datasets = setup_TestDatasets()

    process = Validation(
        datasets, 'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(other_name='k1').calc_metrics})

    jobs = process.get_processing_jobs()
    for job in jobs:
        results = process.calc(*job)
        assert sorted(list(results)) == sorted(list(tst_results))
Example #2
def test_validation_n3_k2_masking_no_data_remains():

    datasets = setup_TestDatasets()

    # setup masking datasets

    grid = grids.CellGrid(np.array([1, 2, 3, 4]), np.array([1, 2, 3, 4]),
                          np.array([4, 4, 2, 1]), gpis=np.array([1, 2, 3, 4]))

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        'masking1': {
            'class': mds1,
            'columns': ['x'],
            'args': [],
            'kwargs': {'limit': 500},
            'use_lut': False,
            'grids_compatible': True},
        'masking2': {
            'class': mds2,
            'columns': ['x'],
            'args': [],
            'kwargs': {'limit': 1000},
            'use_lut': False,
            'grids_compatible': True}
    }

    process = Validation(
        datasets, 'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(other_name='k1').calc_metrics},
        masking_datasets=mds)

    gpi_info = (1, 1, 1)
    ref_df = datasets['DS1']['class'].read_ts(1)
    new_ref_df = process.mask_dataset(ref_df, gpi_info)
    assert len(new_ref_df) == 0
    nptest.assert_allclose(new_ref_df.x.values, np.arange(1000, 1000))
    jobs = process.get_processing_jobs()
    for job in jobs:
        results = process.calc(*job)
        tst = []
        assert sorted(list(results)) == sorted(list(tst))
        for key, tst_key in zip(sorted(results),
                                sorted(tst)):
            nptest.assert_almost_equal(results[key]['n_obs'],
                                       tst[tst_key]['n_obs'])
Example #3
def test_validation_n2_k2_temporal_matching_no_matches():

    tst_results = {}

    datasets = setup_two_without_overlap()

    process = Validation(
        datasets, 'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (2, 2): metrics_calculators.BasicMetrics(other_name='k1').calc_metrics})

    jobs = process.get_processing_jobs()
    for job in jobs:
        results = process.calc(*job)
        assert sorted(list(results)) == sorted(list(tst_results))
Example #4
def test_validation_n3_k2():
    tst_results = {
        (("DS1", "x"), ("DS3", "y")): {
            "n_obs": np.array([1000], dtype=np.int32),
            "tau": np.array([np.nan], dtype=np.float32),
            "gpi": np.array([4], dtype=np.int32),
            "RMSD": np.array([0.0], dtype=np.float32),
            "lon": np.array([4.0]),
            "p_tau": np.array([np.nan], dtype=np.float32),
            "BIAS": np.array([0.0], dtype=np.float32),
            "p_rho": np.array([0.0], dtype=np.float32),
            "rho": np.array([1.0], dtype=np.float32),
            "lat": np.array([4.0]),
            "R": np.array([1.0], dtype=np.float32),
            "p_R": np.array([0.0], dtype=np.float32),
        },
        (("DS1", "x"), ("DS2", "y")): {
            "n_obs": np.array([1000], dtype=np.int32),
            "tau": np.array([np.nan], dtype=np.float32),
            "gpi": np.array([4], dtype=np.int32),
            "RMSD": np.array([0.0], dtype=np.float32),
            "lon": np.array([4.0]),
            "p_tau": np.array([np.nan], dtype=np.float32),
            "BIAS": np.array([0.0], dtype=np.float32),
            "p_rho": np.array([0.0], dtype=np.float32),
            "rho": np.array([1.0], dtype=np.float32),
            "lat": np.array([4.0]),
            "R": np.array([1.0], dtype=np.float32),
            "p_R": np.array([0.0], dtype=np.float32),
        },
        (("DS1", "x"), ("DS3", "x")): {
            "n_obs": np.array([1000], dtype=np.int32),
            "tau": np.array([np.nan], dtype=np.float32),
            "gpi": np.array([4], dtype=np.int32),
            "RMSD": np.array([0.0], dtype=np.float32),
            "lon": np.array([4.0]),
            "p_tau": np.array([np.nan], dtype=np.float32),
            "BIAS": np.array([0.0], dtype=np.float32),
            "p_rho": np.array([0.0], dtype=np.float32),
            "rho": np.array([1.0], dtype=np.float32),
            "lat": np.array([4.0]),
            "R": np.array([1.0], dtype=np.float32),
            "p_R": np.array([0.0], dtype=np.float32),
        },
        (("DS2", "y"), ("DS3", "x")): {
            "gpi": np.array([4], dtype=np.int32),
            "lon": np.array([4.0]),
            "lat": np.array([4.0]),
            "n_obs": np.array([1000], dtype=np.int32),
            "R": np.array([1.0], dtype=np.float32),
            "p_R": np.array([0.0], dtype=np.float32),
            "rho": np.array([1.0], dtype=np.float32),
            "p_rho": np.array([0.0], dtype=np.float32),
            "RMSD": np.array([0.0], dtype=np.float32),
            "BIAS": np.array([0.0], dtype=np.float32),
            "tau": np.array([np.nan], dtype=np.float32),
            "p_tau": np.array([np.nan], dtype=np.float32),
        },
        (("DS2", "y"), ("DS3", "y")): {
            "gpi": np.array([4], dtype=np.int32),
            "lon": np.array([4.0]),
            "lat": np.array([4.0]),
            "n_obs": np.array([1000], dtype=np.int32),
            "R": np.array([1.0], dtype=np.float32),
            "p_R": np.array([0.0], dtype=np.float32),
            "rho": np.array([1.0], dtype=np.float32),
            "p_rho": np.array([0.0], dtype=np.float32),
            "RMSD": np.array([0.0], dtype=np.float32),
            "BIAS": np.array([0.0], dtype=np.float32),
            "tau": np.array([np.nan], dtype=np.float32),
            "p_tau": np.array([np.nan], dtype=np.float32),
        },
    }

    datasets = setup_TestDatasets()
    dm = DataManager(
        datasets,
        "DS1",
        read_ts_names={d: "read"
                       for d in ["DS1", "DS2", "DS3"]},
    )

    process = Validation(
        dm,
        "DS1",
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0).combinatory_matcher,
        scaling="lin_cdf_match",
        metrics_calculators={
            (3, 2):
            metrics_calculators.BasicMetrics(other_name="k1").calc_metrics
        },
    )

    jobs = process.get_processing_jobs()
    for job in jobs:
        results = process.calc(*job)
        assert sorted(list(results)) == sorted(list(tst_results))
Example #5
def getdata():
    """
    Handles the GET request, which should contain the arguments listed under
    Parameters.

    Parameters
    ----------
    station_id: int
        id of station in database
    scaling: string
        chosen scaling method, for available choices see general.time_series.scaling
    snow_depth: float
        mask snow depth greater than this value
    st_l1: float
        mask surface temperature layer1 lower than this value
    air_temp: float
        mask 2m air temperature lower than this value
    ssf_masking: boolean
        use SSF for masking (true or false)
    """
    station_id = request.args.get('station_id')
    scaling = request.args.get('scaling')
    if scaling == 'noscale':
        scaling = None
    masking_ids = request.args.getlist('masking_ds[]')
    masking_ops = request.args.getlist('masking_op[]')
    masking_values = request.args.getlist('masking_values[]')
    masking_values = [float(x) for x in masking_values]

    anomaly = request.args.get('anomaly')
    if anomaly == 'none':
        anomaly = None

    (depth_from,
     depth_to,
     sensor_id) = get_station_first_sm_layer(app.config['ISMN_PATH'],
                                             station_id)
    lon, lat = get_station_lonlat(app.config['ISMN_PATH'],
                                  station_id)
    start, end = get_station_start_end(app.config['ISMN_PATH'],
                                       station_id, "soil moisture",
                                       depth_from, depth_to)
    period = [start, end]

    masking_data = {'labels': [], 'data': []}
    masking_meta = get_masking_metadata()
    masking_masked_dict = None
    if len(masking_ids) > 0:
        # prepare masking datasets
        masking_ds_dict = get_masking_ds_dict(masking_ids)
        masking_masked_dict = {}
        for masking_ds, masking_op, masking_value in zip(masking_ids,
                                                         masking_ops,
                                                         masking_values):

            masking_masked_dict[masking_ds] = dict(masking_ds_dict[masking_ds])
            new_cls = MaskingAdapter(masking_masked_dict[masking_ds]['class'],
                                     masking_op,
                                     masking_value)
            masking_masked_dict[masking_ds]['class'] = new_cls

        # use DataManager for reading masking datasets
        masking_dm = DataManager(masking_ds_dict, masking_ids[0],
                                 period=period)
        masking_data = {}
        valid_masking_ids = []
        for mds in masking_ids:
            mdata = masking_dm.read_ds(mds, lon, lat)
            if mdata is not None:
                masking_data[mds] = mdata
                valid_masking_ids.append(mds)
            else:
                masking_data[mds] = pd.DataFrame()
        if len(valid_masking_ids) > 1:
            masking_data = BasicTemporalMatching(window=1.0).combinatory_matcher(
                masking_data, masking_ids[0], n=len(masking_ids))

            if len(masking_data) > 0:
                labels, values = masking_data[
                    list(masking_data.keys())[0]].to_dygraph_format()
        elif len(valid_masking_ids) == 1:
            masking_data = masking_data[valid_masking_ids[0]]
            labels, values = masking_data.to_dygraph_format()
        else:
            labels = [None]
            values = None

        for i, label in enumerate(labels):
            for mid in masking_meta:
                if masking_meta[mid]['variable']['name'] in label:
                    labels[i] = masking_meta[mid]['long_name']

        masking_data = {'labels': labels, 'data': values}

    ismn_iface = prepare_station_interface(app.config['ISMN_PATH'],
                                           station_id,
                                           "soil moisture",
                                           depth_from, depth_to, sensor_id)

    validation_ds_dict = get_validation_ds_dict()
    validation_ds_dict.update({'ISMN': {'class': ismn_iface,
                                        'columns': ['soil moisture']}})

    if anomaly is not None:
        adapter = {'climatology': AnomalyClimAdapter,
                   'average': AnomalyAdapter}
        for dataset in validation_ds_dict:
            validation_ds_dict[dataset]['class'] = adapter[
                anomaly](validation_ds_dict[dataset]['class'],
                         columns=validation_ds_dict[dataset]['columns'])

    mcalc = BasicMetricsPlusMSE(other_name='k1',
                                calc_tau=True).calc_metrics
    process = Validation(validation_ds_dict, 'ISMN',
                         temporal_ref='cci',
                         scaling=scaling,
                         metrics_calculators={(2, 2): mcalc},
                         masking_datasets=masking_masked_dict,
                         period=period,
                         temporal_window=1)

    df_dict = process.data_manager.get_data(1,
                                            lon,
                                            lat)

    matched_data, result, used_data = process.perform_validation(
        df_dict, (1, lon, lat))

    res_key = list(result)[0]
    data = used_data[res_key]
    result = result[res_key][0]

    # rename data to original names
    rename_dict = {}
    f = lambda x: "k{}".format(x) if x > 0 else 'ref'
    for i, r in enumerate(res_key):
        rename_dict[f(i)] = " ".join(r)

    data.rename(columns=rename_dict, inplace=True)

    labels, values = data.to_dygraph_format()

    validation_datasets = {'labels': labels, 'data': values}

    statistics = {'kendall': {'v': '%.2f' % result['tau'], 'p': '%.4f' % result['p_tau']},
                  'spearman': {'v': '%.2f' % result['rho'], 'p': '%.4f' % result['p_rho']},
                  'pearson': {'v': '%.2f' % result['R'], 'p': '%.4f' % result['p_R']},
                  'bias': '%.4f' % result['BIAS'],
                  'rmsd': {'rmsd': '%.4f' % np.sqrt(result['mse']),
                           'rmsd_corr': '%.4f' % np.sqrt(result['mse_corr']),
                           'rmsd_bias': '%.4f' % np.sqrt(result['mse_bias']),
                           'rmsd_var': '%.4f' % np.sqrt(result['mse_var'])},
                  'mse': {'mse': '%.4f' % result['mse'],
                          'mse_corr': '%.4f' % result['mse_corr'],
                          'mse_bias': '%.4f' % result['mse_bias'],
                          'mse_var': '%.4f' % result['mse_var']}}

    scaling_options = {'noscale': 'No scaling',
                       'porosity': 'Scale using porosity',
                       'linreg': 'Linear Regression',
                       'mean_std': 'Mean - standard deviation',
                       'min_max': 'Minimum,maximum',
                       'lin_cdf_match': 'Piecewise <br> linear CDF matching',
                       'cdf_match': 'CDF matching'}

    if scaling is None:
        scaling = 'noscale'

    masking_option_return = {}
    for mid, mops, mval in zip(masking_ids,
                               masking_ops,
                               masking_values):
        masking_option_return[mid] = {'op': mops,
                                      'val': mval,
                                      'name': masking_meta[mid]['long_name']}

    settings = {'scaling': scaling_options[scaling],
                'masking': masking_option_return}

    output_data = {'validation_data': validation_datasets, 'masking_data': masking_data,
                   'statistics': statistics, 'settings': settings}
    status = 1
    if status == -1:
        data = 'Error'
    else:
        data = jsonify(output_data)

    resp = make_response(data)
    resp.headers['Access-Control-Allow-Origin'] = '*'
    return resp
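The docstring of getdata() above lists the query parameters the handler reads via request.args. As a rough sketch of a matching client call (the endpoint URL, station id and masking dataset id are hypothetical; only the parameter names and value choices come from the handler above):

import requests

# Hypothetical endpoint URL; the parameter names follow the request.args /
# request.args.getlist calls in getdata() above.
params = {
    'station_id': 101,              # id of a station in the database (hypothetical)
    'scaling': 'lin_cdf_match',     # or 'noscale' to disable scaling
    'anomaly': 'none',              # or 'climatology' / 'average'
    'masking_ds[]': ['masking1'],   # hypothetical masking dataset id
    'masking_op[]': ['<'],
    'masking_values[]': ['0.5'],    # converted to float by the handler
}
resp = requests.get('http://localhost:5000/getdata', params=params)
print(resp.json()['statistics'])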
Example #6
def test_temporal_matching_ascat_ismn():
    """
    This test uses a CSV file of ASCAT and ISMN data to test if the temporal
    matching within the validation works as expected in a "real" setup.
    This only tests whether the number of observations matches, because this is
    the main thing the temporal matching influences.
    """

    # test with ASCAT and ISMN data
    here = Path(__file__).resolve().parent
    ascat = pd.read_csv(here / "ASCAT.csv", index_col=0, parse_dates=True)
    ismn = pd.read_csv(here / "ISMN.csv", index_col=0, parse_dates=True)
    dfs = {"ASCAT": ascat, "ISMN": ismn}
    columns = {"ASCAT": "sm", "ISMN": "soil_moisture"}
    refname = "ISMN"
    window = pd.Timedelta(12, "H")

    old_matcher = BasicTemporalMatching().combinatory_matcher
    new_matcher = make_combined_temporal_matcher(window)

    datasets = {}
    for key in ["ISMN", "ASCAT"]:
        all_columns = list(dfs[key].columns)
        ds = {"columns": [columns[key]], "class": DummyReader(dfs[key], all_columns)}
        datasets[key] = ds

    new_val = Validation(
        datasets,
        refname,
        scaling=None,  # doesn't work with the constant test data
        temporal_matcher=new_matcher,
        metrics_calculators={
            (2, 2): PairwiseIntercomparisonMetrics().calc_metrics
        }
    )
    new_results = new_val.calc(
        1, 1, 1, rename_cols=False, only_with_temporal_ref=True
    )

    # old setup
    ds_names = list(datasets.keys())
    metrics = IntercomparisonMetrics(
        dataset_names=ds_names,
        # passing the names here explicitly, see GH issue #220
        refname=refname,
        other_names=ds_names[1:],
        calc_tau=True,
    )
    old_val = Validation(
        datasets,
        refname,
        scaling=None,  # doesn't work with the constant test data
        temporal_matcher=old_matcher,
        metrics_calculators={
            (2, 2): metrics.calc_metrics
        }
    )
    old_results = old_val.calc(
        1, 1, 1, rename_cols=False
    )

    old_key = (('ASCAT', 'sm'), ('ISMN', 'soil_moisture'))
    new_key = (('ASCAT', 'sm'), ('ISMN', 'soil_moisture'))

    assert old_results[old_key]["n_obs"] == new_results[new_key]["n_obs"]
Example #7
def test_PairwiseIntercomparisonMetrics_confidence_intervals():
    # tests if the correct confidence intervals are returned

    datasets, _ = testdata_random()
    matcher = make_combined_temporal_matcher(pd.Timedelta(6, "H"))
    val = Validation(
        datasets,
        "reference_name",
        scaling=None,  # doesn't work with the constant test data
        temporal_matcher=matcher,
        metrics_calculators={
            (4, 2): (
                PairwiseIntercomparisonMetrics(
                    calc_spearman=True,
                    calc_kendall=True,
                    analytical_cis=True,
                    bootstrap_cis=True,
                ).calc_metrics
            )
        }
    )
    results_pw = val.calc(
        [1], [1], [1], rename_cols=False, only_with_temporal_ref=True
    )

    metrics_with_ci = {
        "BIAS": "bias",
        "R": "pearson_r",
        "rho": "spearman_r",
        "tau": "kendall_tau",
        "RMSD": "rmsd",
        "urmsd": "ubrmsd",
        "mse": "msd",
        "mse_bias": "mse_bias",
    }
    metrics_with_bs_ci = {
        "mse_corr": "mse_corr",
        "mse_var": "mse_var",
    }

    # reconstruct dataframe
    frames = []
    for key in datasets:
        frames.append(datasets[key]["class"].data)
    data = pd.concat(frames, axis=1)
    data.dropna(how="any", inplace=True)

    for key in results_pw:
        othername = key[0][0]
        other_col = othername.split("_")[0]
        other = data[other_col].values
        refname = key[1][0]
        ref_col = refname.split("_")[0]
        ref = data[ref_col].values
        for metric_key in metrics_with_ci:
            lower = results_pw[key][f"{metric_key}_ci_lower"]
            upper = results_pw[key][f"{metric_key}_ci_upper"]

            # calculate manually from data
            metric_func = getattr(pairwise, metrics_with_ci[metric_key])
            m, lb, ub = with_analytical_ci(
                metric_func, other, ref
            )
            # difference due to float32 vs. float64
            assert_almost_equal(upper, ub, 6)
            assert_almost_equal(lower, lb, 6)

        for metric_key in metrics_with_bs_ci:
            lower = results_pw[key][f"{metric_key}_ci_lower"]
            upper = results_pw[key][f"{metric_key}_ci_upper"]

            # calculate manually from data
            metric_func = getattr(pairwise, metrics_with_bs_ci[metric_key])
            m, lb, ub = with_bootstrapped_ci(
                metric_func, other, ref
            )
            assert_allclose(upper, ub, rtol=1e-1, atol=1e-4)
            assert_allclose(lower, lb, rtol=1e-1, atol=1e-4)
Example #8
def test_ascat_ismn_validation_metadata_rolling():
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    ascat_data_folder = os.path.join(os.path.dirname(__file__), '..',
                                     'test-data', 'sat', 'ascat', 'netcdf',
                                     '55R22')

    ascat_grid_folder = os.path.join(os.path.dirname(__file__), '..',
                                     'test-data', 'sat', 'ascat', 'netcdf',
                                     'grid')

    static_layers_folder = os.path.join(os.path.dirname(__file__), '..',
                                        'test-data', 'sat', 'h_saf',
                                        'static_layer')

    ascat_reader = AscatSsmCdr(ascat_data_folder,
                               ascat_grid_folder,
                               grid_filename='TUW_WARP5_grid_info_2_1.nc',
                               static_layer_path=static_layers_folder)
    ascat_reader.read_bulk = True

    # Initialize ISMN reader

    ismn_data_folder = os.path.join(os.path.dirname(__file__), '..',
                                    'test-data', 'ismn', 'multinetwork',
                                    'header_values')
    ismn_reader = ISMN_Interface(ismn_data_folder)

    jobs = []

    ids = ismn_reader.get_dataset_ids(variable='soil moisture',
                                      min_depth=0,
                                      max_depth=0.1)

    metadata_dict_template = {
        'network': np.array(['None'], dtype='U256'),
        'station': np.array(['None'], dtype='U256'),
        'landcover': np.float32([np.nan]),
        'climate': np.array(['None'], dtype='U4')
    }

    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        metadata_dict = [{
            'network': metadata['network'],
            'station': metadata['station'],
            'landcover': metadata['landcover_2010'],
            'climate': metadata['climate']
        }]
        jobs.append(
            (idx, metadata['longitude'], metadata['latitude'], metadata_dict))

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        'ISMN': {
            'class': ismn_reader,
            'columns': ['soil moisture']
        },
        'ASCAT': {
            'class': ascat_reader,
            'columns': ['sm'],
            'kwargs': {
                'mask_frozen_prob': 80,
                'mask_snow_prob': 80,
                'mask_ssf': True
            }
        }
    }

    read_ts_names = {'ASCAT': 'read', 'ISMN': 'read_ts'}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(datasets,
                           'ISMN',
                           period,
                           read_ts_names=read_ts_names)

    process = Validation(
        datasets,
        'ISMN',
        temporal_ref='ASCAT',
        scaling='lin_cdf_match',
        scaling_ref='ASCAT',
        metrics_calculators={
            (2, 2):
            metrics_calculators.RollingMetrics(
                other_name='k1',
                metadata_template=metadata_dict_template).calc_metrics
        },
        period=period)

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results,
                               save_path,
                               ts_vars=['R', 'p_R', 'RMSD'])

    results_fname = os.path.join(save_path,
                                 'ASCAT.sm_with_ISMN.soil moisture.nc')

    vars_should = [
        u'gpi', u'lon', u'lat', u'R', u'p_R', u'time', u'idx', u'_row_size'
    ]

    for key, value in metadata_dict_template.items():
        vars_should.append(key)

    network_should = np.array([
        'MAQU', 'MAQU', 'SCAN', 'SCAN', 'SCAN', 'SOILSCAPE', 'SOILSCAPE',
        'SOILSCAPE'
    ],
                              dtype='U256')

    reader = PointDataResults(results_fname, read_only=True)
    df = reader.read_loc(None)
    nptest.assert_equal(sorted(network_should), sorted(df['network'].values))
    assert np.all(df.gpi.values == np.arange(8))
    assert (reader.read_ts(0).index.size == 357)
    assert np.all(
        reader.read_ts(1).columns.values == np.array(['R', 'p_R', 'RMSD']))
Example #9
def test_ascat_ismn_validation():
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    ascat_data_folder = os.path.join(os.path.dirname(__file__), '..',
                                     'test-data', 'sat', 'ascat', 'netcdf',
                                     '55R22')

    ascat_grid_folder = os.path.join(os.path.dirname(__file__), '..',
                                     'test-data', 'sat', 'ascat', 'netcdf',
                                     'grid')

    static_layers_folder = os.path.join(os.path.dirname(__file__), '..',
                                        'test-data', 'sat', 'h_saf',
                                        'static_layer')

    ascat_reader = AscatSsmCdr(ascat_data_folder,
                               ascat_grid_folder,
                               grid_filename='TUW_WARP5_grid_info_2_1.nc',
                               static_layer_path=static_layers_folder)
    ascat_reader.read_bulk = True

    # Initialize ISMN reader

    ismn_data_folder = os.path.join(os.path.dirname(__file__), '..',
                                    'test-data', 'ismn', 'multinetwork',
                                    'header_values')
    ismn_reader = ISMN_Interface(ismn_data_folder)

    jobs = []

    ids = ismn_reader.get_dataset_ids(variable='soil moisture',
                                      min_depth=0,
                                      max_depth=0.1)
    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        jobs.append((idx, metadata['longitude'], metadata['latitude']))

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        'ISMN': {
            'class': ismn_reader,
            'columns': ['soil moisture']
        },
        'ASCAT': {
            'class': ascat_reader,
            'columns': ['sm'],
            'kwargs': {
                'mask_frozen_prob': 80,
                'mask_snow_prob': 80,
                'mask_ssf': True
            }
        }
    }

    read_ts_names = {'ASCAT': 'read', 'ISMN': 'read_ts'}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(datasets,
                           'ISMN',
                           period,
                           read_ts_names=read_ts_names)

    process = Validation(
        datasets,
        'ISMN',
        temporal_ref='ASCAT',
        scaling='lin_cdf_match',
        scaling_ref='ASCAT',
        metrics_calculators={
            (2, 2):
            metrics_calculators.BasicMetrics(other_name='k1').calc_metrics
        },
        period=period)

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(save_path,
                                 'ASCAT.sm_with_ISMN.soil moisture.nc')

    vars_should = [
        u'n_obs', u'tau', u'gpi', u'RMSD', u'lon', u'p_tau', u'BIAS', u'p_rho',
        u'rho', u'lat', u'R', u'p_R', u'time', u'idx', u'_row_size'
    ]
    n_obs_should = [384, 357, 482, 141, 251, 1927, 1887, 1652]
    rho_should = np.array([
        0.70022893, 0.53934574, 0.69356072, 0.84189808, 0.74206454, 0.30299741,
        0.53143877, 0.62204134
    ],
                          dtype=np.float32)

    rmsd_should = np.array([
        7.72966719, 11.58347607, 14.57700157, 13.06224251, 12.90389824,
        14.24668026, 21.19682884, 17.3883934
    ],
                           dtype=np.float32)
    with nc.Dataset(results_fname, mode='r') as results:
        assert sorted(list(results.variables.keys())) == sorted(vars_should)
        assert sorted(
            results.variables['n_obs'][:].tolist()) == sorted(n_obs_should)
        nptest.assert_allclose(sorted(rho_should),
                               sorted(results.variables['rho'][:]),
                               rtol=1e-4)
        nptest.assert_allclose(sorted(rmsd_should),
                               sorted(results.variables['RMSD'][:]),
                               rtol=1e-4)
Example #10
def test_validation_n3_k2():

    tst_results = {
        (('DS1', 'x'), ('DS3', 'y')): {
            'n_obs': np.array([1000], dtype=np.int32),
            'tau': np.array([np.nan], dtype=np.float32),
            'gpi': np.array([4], dtype=np.int32),
            'RMSD': np.array([0.], dtype=np.float32),
            'lon': np.array([4.]),
            'p_tau': np.array([np.nan], dtype=np.float32),
            'BIAS': np.array([0.], dtype=np.float32),
            'p_rho': np.array([0.], dtype=np.float32),
            'rho': np.array([1.], dtype=np.float32),
            'lat': np.array([4.]),
            'R': np.array([1.], dtype=np.float32),
            'p_R': np.array([0.], dtype=np.float32)
        },
        (('DS1', 'x'), ('DS2', 'y')): {
            'n_obs': np.array([1000], dtype=np.int32),
            'tau': np.array([np.nan], dtype=np.float32),
            'gpi': np.array([4], dtype=np.int32),
            'RMSD': np.array([0.], dtype=np.float32),
            'lon': np.array([4.]),
            'p_tau': np.array([np.nan], dtype=np.float32),
            'BIAS': np.array([0.], dtype=np.float32),
            'p_rho': np.array([0.], dtype=np.float32),
            'rho': np.array([1.], dtype=np.float32),
            'lat': np.array([4.]),
            'R': np.array([1.], dtype=np.float32),
            'p_R': np.array([0.], dtype=np.float32)
        },
        (('DS1', 'x'), ('DS3', 'x')): {
            'n_obs': np.array([1000], dtype=np.int32),
            'tau': np.array([np.nan], dtype=np.float32),
            'gpi': np.array([4], dtype=np.int32),
            'RMSD': np.array([0.], dtype=np.float32),
            'lon': np.array([4.]),
            'p_tau': np.array([np.nan], dtype=np.float32),
            'BIAS': np.array([0.], dtype=np.float32),
            'p_rho': np.array([0.], dtype=np.float32),
            'rho': np.array([1.], dtype=np.float32),
            'lat': np.array([4.]),
            'R': np.array([1.], dtype=np.float32),
            'p_R': np.array([0.], dtype=np.float32)
        },
        (('DS2', 'y'), ('DS3', 'x')): {
            'gpi': np.array([4], dtype=np.int32),
            'lon': np.array([4.]),
            'lat': np.array([4.]),
            'n_obs': np.array([1000], dtype=np.int32),
            'R': np.array([1.], dtype=np.float32),
            'p_R': np.array([0.], dtype=np.float32),
            'rho': np.array([1.], dtype=np.float32),
            'p_rho': np.array([0.], dtype=np.float32),
            'RMSD': np.array([0.], dtype=np.float32),
            'BIAS': np.array([0.], dtype=np.float32),
            'tau': np.array([np.nan], dtype=np.float32),
            'p_tau': np.array([np.nan], dtype=np.float32)
        },
        (('DS2', 'y'), ('DS3', 'y')): {
            'gpi': np.array([4], dtype=np.int32),
            'lon': np.array([4.]),
            'lat': np.array([4.]),
            'n_obs': np.array([1000], dtype=np.int32),
            'R': np.array([1.], dtype=np.float32),
            'p_R': np.array([0.], dtype=np.float32),
            'rho': np.array([1.], dtype=np.float32),
            'p_rho': np.array([0.], dtype=np.float32),
            'RMSD': np.array([0.], dtype=np.float32),
            'BIAS': np.array([0.], dtype=np.float32),
            'tau': np.array([np.nan], dtype=np.float32),
            'p_tau': np.array([np.nan], dtype=np.float32)
        }
    }

    datasets = setup_TestDatasets()
    dm = DataManager(datasets,
                     'DS1',
                     read_ts_names={d: 'read'
                                    for d in ['DS1', 'DS2', 'DS3']})

    process = Validation(
        dm,
        'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (3, 2):
            metrics_calculators.BasicMetrics(other_name='k1').calc_metrics
        })

    jobs = process.get_processing_jobs()
    for job in jobs:
        results = process.calc(*job)
        assert sorted(list(results)) == sorted(list(tst_results))
Example #11
def create_pytesmo_validation(validation_run):
    ds_list = []
    ref_name = None
    scaling_ref_name = None

    ds_num = 1
    for dataset_config in validation_run.dataset_configurations.all():
        reader = create_reader(dataset_config.dataset, dataset_config.version)
        reader = setup_filtering(
            reader, list(dataset_config.filters.all()),
            list(dataset_config.parametrisedfilter_set.all()),
            dataset_config.dataset, dataset_config.variable)

        if validation_run.anomalies == ValidationRun.MOVING_AVG_35_D:
            reader = AnomalyAdapter(
                reader,
                window_size=35,
                columns=[dataset_config.variable.pretty_name])
        if validation_run.anomalies == ValidationRun.CLIMATOLOGY:
            # make sure our baseline period is in UTC and without timezone information
            anomalies_baseline = [
                validation_run.anomalies_from.astimezone(tz=pytz.UTC).replace(
                    tzinfo=None),
                validation_run.anomalies_to.astimezone(tz=pytz.UTC).replace(
                    tzinfo=None)
            ]
            reader = AnomalyClimAdapter(
                reader,
                columns=[dataset_config.variable.pretty_name],
                timespan=anomalies_baseline)

        if ((validation_run.reference_configuration) and
            (dataset_config.id == validation_run.reference_configuration.id)):
            # reference is always named "0-..."
            dataset_name = '{}-{}'.format(0, dataset_config.dataset.short_name)
        else:
            dataset_name = '{}-{}'.format(ds_num,
                                          dataset_config.dataset.short_name)
            ds_num += 1

        ds_list.append((dataset_name, {
            'class': reader,
            'columns': [dataset_config.variable.pretty_name]
        }))

        if ((validation_run.reference_configuration) and
            (dataset_config.id == validation_run.reference_configuration.id)):
            ref_name = dataset_name
        if ((validation_run.scaling_ref)
                and (dataset_config.id == validation_run.scaling_ref.id)):
            scaling_ref_name = dataset_name

    datasets = dict(ds_list)
    ds_num = len(ds_list)

    period = None
    if validation_run.interval_from is not None and validation_run.interval_to is not None:
        # while pytesmo can't deal with timezones, normalise the validation period to utc; can be removed once pytesmo can do timezones
        startdate = validation_run.interval_from.astimezone(UTC).replace(
            tzinfo=None)
        enddate = validation_run.interval_to.astimezone(UTC).replace(
            tzinfo=None)
        period = [startdate, enddate]

    datamanager = DataManager(datasets,
                              ref_name=ref_name,
                              period=period,
                              read_ts_names='read')
    ds_names = get_dataset_names(datamanager.reference_name,
                                 datamanager.datasets,
                                 n=ds_num)

    if (len(ds_names) >= 3) and (validation_run.tcol is True):
        # if there are 3 or more datasets, do TC and exclude ref metrics
        metrics = TCMetrics(
            dataset_names=ds_names,
            tc_metrics_for_ref=False,
            other_names=['k{}'.format(i + 1) for i in range(ds_num - 1)])
    else:
        metrics = IntercomparisonMetrics(
            dataset_names=ds_names,
            other_names=['k{}'.format(i + 1) for i in range(ds_num - 1)])

    if validation_run.scaling_method == validation_run.NO_SCALING:
        scaling_method = None
    else:
        scaling_method = validation_run.scaling_method

    __logger.debug(f"Scaling method: {scaling_method}")
    __logger.debug(f"Scaling dataset: {scaling_ref_name}")

    val = Validation(datasets=datamanager,
                     spatial_ref=ref_name,
                     temporal_window=0.5,
                     scaling=scaling_method,
                     scaling_ref=scaling_ref_name,
                     metrics_calculators={
                         (ds_num, ds_num): metrics.calc_metrics
                     },
                     period=period)

    return val
Example #12
def test_ascat_ismn_validation():
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    ascat_data_folder = os.path.join(os.path.dirname(__file__), '..', 'test-data',
                                     'sat', 'ascat', 'netcdf', '55R22')

    ascat_grid_folder = os.path.join(os.path.dirname(__file__), '..', 'test-data',
                                     'sat', 'ascat', 'netcdf', 'grid')

    ascat_reader = AscatH25_SSM(ascat_data_folder, ascat_grid_folder)
    ascat_reader.read_bulk = True
    ascat_reader._load_grid_info()

    # Initialize ISMN reader

    ismn_data_folder = os.path.join(os.path.dirname(__file__), '..', 'test-data',
                                    'ismn', 'multinetwork', 'header_values')
    ismn_reader = ISMN_Interface(ismn_data_folder)

    jobs = []

    ids = ismn_reader.get_dataset_ids(
        variable='soil moisture',
        min_depth=0,
        max_depth=0.1)
    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        jobs.append((idx, metadata['longitude'], metadata['latitude']))

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        'ISMN': {
            'class': ismn_reader,
            'columns': ['soil moisture']
        },
        'ASCAT': {
            'class': ascat_reader,
            'columns': ['sm'],
            'kwargs': {'mask_frozen_prob': 80,
                       'mask_snow_prob': 80,
                       'mask_ssf': True}
        }}

    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    process = Validation(
        datasets, 'ISMN',
        temporal_ref='ASCAT',
        scaling='lin_cdf_match',
        scaling_ref='ASCAT',
        metrics_calculators={
            (2, 2): metrics_calculators.BasicMetrics(other_name='k1').calc_metrics},
        period=period)

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(
        save_path, 'ASCAT.sm_with_ISMN.soil moisture.nc')

    vars_should = [u'n_obs', u'tau', u'gpi', u'RMSD', u'lon', u'p_tau',
                   u'BIAS', u'p_rho', u'rho', u'lat', u'R', u'p_R']
    n_obs_should = [360, 385, 1644, 1881, 1927, 479, 140, 251]
    rho_should = np.array([0.546187,
                           0.717398,
                           0.620892,
                           0.532465,
                           0.302997,
                           0.694713,
                           0.840592,
                           0.742065],
                          dtype=np.float32)

    rmsd_should = np.array([11.536263,
                            7.545650,
                            17.451935,
                            21.193714,
                            14.246680,
                            14.494674,
                            13.173215,
                            12.903898],
                           dtype=np.float32)
    with nc.Dataset(results_fname) as results:
        assert sorted(results.variables.keys()) == sorted(vars_should)
        assert sorted(results.variables['n_obs'][:].tolist()) == sorted(
            n_obs_should)
        nptest.assert_allclose(sorted(rho_should),
                               sorted(results.variables['rho'][:]),
                               rtol=1e-4)
        nptest.assert_allclose(sorted(rmsd_should),
                               sorted(results.variables['RMSD'][:]),
                               rtol=1e-4)
Example #13
def test_validation_n3_k2_masking():

    # test result for one gpi in a cell
    tst_results_one = {
        (('DS1', 'x'), ('DS3', 'y')): {
            'n_obs': np.array([250], dtype=np.int32)},
        (('DS1', 'x'), ('DS2', 'y')): {
            'n_obs': np.array([250], dtype=np.int32)},
        (('DS1', 'x'), ('DS3', 'x')): {
            'n_obs': np.array([250], dtype=np.int32)}}

    # test result for two gpis in a cell
    tst_results_two = {
        (('DS1', 'x'), ('DS3', 'y')): {
            'n_obs': np.array([250, 250], dtype=np.int32)},
        (('DS1', 'x'), ('DS2', 'y')): {
            'n_obs': np.array([250, 250], dtype=np.int32)},
        (('DS1', 'x'), ('DS3', 'x')): {
            'n_obs': np.array([250, 250], dtype=np.int32)}}

    # cell 4 in this example has two gpis so it returns different results.
    tst_results = {1: tst_results_one,
                   2: tst_results_two}

    datasets = setup_TestDatasets()

    # setup masking datasets

    grid = grids.CellGrid(np.array([1, 2, 3, 4]), np.array([1, 2, 3, 4]),
                          np.array([4, 4, 2, 1]), gpis=np.array([1, 2, 3, 4]))

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        'masking1': {
            'class': mds1,
            'columns': ['x'],
            'args': [],
            'kwargs': {'limit': 500},
            'use_lut': False,
            'grids_compatible': True},
        'masking2': {
            'class': mds2,
            'columns': ['x'],
            'args': [],
            'kwargs': {'limit': 750},
            'use_lut': False,
            'grids_compatible': True}
    }

    process = Validation(
        datasets, 'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (3, 2): metrics_calculators.BasicMetrics(other_name='k1').calc_metrics},
        masking_datasets=mds)

    gpi_info = (1, 1, 1)
    ref_df = datasets['DS1']['class'].read_ts(1)
    new_ref_df = process.mask_dataset(ref_df, gpi_info)
    assert len(new_ref_df) == 250
    nptest.assert_allclose(new_ref_df.x.values, np.arange(750, 1000))
    jobs = process.get_processing_jobs()
    for job in jobs:
        results = process.calc(*job)
        tst = tst_results[len(job[0])]
        assert sorted(list(results)) == sorted(list(tst))
        for key, tst_key in zip(sorted(results),
                                sorted(tst)):
            nptest.assert_almost_equal(results[key]['n_obs'],
                                       tst[tst_key]['n_obs'])
Example #14
def create_pytesmo_validation(validation_run):
    ds_list = []
    ref_name = None
    scaling_ref_name = None

    ds_num = 1
    for dataset_config in validation_run.dataset_configurations.all():
        reader = create_reader(dataset_config.dataset, dataset_config.version)
        reader = setup_filtering(
            reader, list(dataset_config.filters.all()),
            list(dataset_config.parametrisedfilter_set.all()),
            dataset_config.dataset, dataset_config.variable)

        if validation_run.anomalies == ValidationRun.MOVING_AVG_35_D:
            reader = AnomalyAdapter(
                reader,
                window_size=35,
                columns=[dataset_config.variable.pretty_name])
        if validation_run.anomalies == ValidationRun.CLIMATOLOGY:
            # make sure our baseline period is in UTC and without timezone information
            anomalies_baseline = [
                validation_run.anomalies_from.astimezone(tz=pytz.UTC).replace(
                    tzinfo=None),
                validation_run.anomalies_to.astimezone(tz=pytz.UTC).replace(
                    tzinfo=None)
            ]
            reader = AnomalyClimAdapter(
                reader,
                columns=[dataset_config.variable.pretty_name],
                timespan=anomalies_baseline)

        if (validation_run.reference_configuration and
            (dataset_config.id == validation_run.reference_configuration.id)):
            # reference is always named "0-..."
            dataset_name = '{}-{}'.format(0, dataset_config.dataset.short_name)
        else:
            dataset_name = '{}-{}'.format(ds_num,
                                          dataset_config.dataset.short_name)
            ds_num += 1

        ds_list.append((dataset_name, {
            'class': reader,
            'columns': [dataset_config.variable.pretty_name]
        }))

        if (validation_run.reference_configuration and
            (dataset_config.id == validation_run.reference_configuration.id)):
            ref_name = dataset_name
            ref_short_name = validation_run.reference_configuration.dataset.short_name

        if (validation_run.scaling_ref
                and (dataset_config.id == validation_run.scaling_ref.id)):
            scaling_ref_name = dataset_name

    datasets = dict(ds_list)
    ds_num = len(ds_list)

    period = None
    if validation_run.interval_from is not None and validation_run.interval_to is not None:
        # while pytesmo can't deal with timezones, normalise the validation period to utc; can be removed once pytesmo can do timezones
        startdate = validation_run.interval_from.astimezone(UTC).replace(
            tzinfo=None)
        enddate = validation_run.interval_to.astimezone(UTC).replace(
            tzinfo=None)
        period = [startdate, enddate]

    upscale_parms = None
    if validation_run.upscaling_method != "none":
        __logger.debug("Upscaling option is active")
        upscale_parms = {
            "upscaling_method": validation_run.upscaling_method,
            "temporal_stability": validation_run.temporal_stability,
        }
        upscaling_lut = create_upscaling_lut(
            validation_run=validation_run,
            datasets=datasets,
            ref_name=ref_name,
        )
        upscale_parms["upscaling_lut"] = upscaling_lut
        __logger.debug("Lookup table for non-reference datasets " +
                       ", ".join(upscaling_lut.keys()) + " created")
        __logger.debug("{}".format(upscaling_lut))

    datamanager = DataManager(
        datasets,
        ref_name=ref_name,
        period=period,
        read_ts_names='read',
        upscale_parms=upscale_parms,
    )
    ds_names = get_dataset_names(datamanager.reference_name,
                                 datamanager.datasets,
                                 n=ds_num)

    # set value of the metadata template according to what reference dataset is used
    if ref_short_name == 'ISMN':
        metadata_template = METADATA_TEMPLATE['ismn_ref']
    else:
        metadata_template = METADATA_TEMPLATE['other_ref']

    pairwise_metrics = PairwiseIntercomparisonMetrics(
        metadata_template=metadata_template,
        calc_kendall=False,
    )

    metric_calculators = {(ds_num, 2): pairwise_metrics.calc_metrics}

    if (len(ds_names) >= 3) and (validation_run.tcol is True):
        tcol_metrics = TripleCollocationMetrics(
            ref_name,
            metadata_template=metadata_template,
            bootstrap_cis=validation_run.bootstrap_tcol_cis)
        metric_calculators.update({(ds_num, 3): tcol_metrics.calc_metrics})

    if validation_run.scaling_method == validation_run.NO_SCALING:
        scaling_method = None
    else:
        scaling_method = validation_run.scaling_method

    __logger.debug(f"Scaling method: {scaling_method}")
    __logger.debug(f"Scaling dataset: {scaling_ref_name}")

    val = Validation(datasets=datamanager,
                     temporal_matcher=make_combined_temporal_matcher(
                         pd.Timedelta(12, "H")),
                     spatial_ref=ref_name,
                     scaling=scaling_method,
                     scaling_ref=scaling_ref_name,
                     metrics_calculators=metric_calculators,
                     period=period)

    return val
Example #15
def test_validation_n3_k2_masking_no_data_remains():
    datasets = setup_TestDatasets()

    # setup masking datasets

    grid = grids.CellGrid(
        np.array([1, 2, 3, 4]),
        np.array([1, 2, 3, 4]),
        np.array([4, 4, 2, 1]),
        gpis=np.array([1, 2, 3, 4]),
    )

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        "masking1": {
            "class": mds1,
            "columns": ["x"],
            "args": [],
            "kwargs": {
                "limit": 500
            },
            "use_lut": False,
            "grids_compatible": True,
        },
        "masking2": {
            "class": mds2,
            "columns": ["x"],
            "args": [],
            "kwargs": {
                "limit": 1000
            },
            "use_lut": False,
            "grids_compatible": True,
        },
    }

    process = Validation(
        datasets,
        "DS1",
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0).combinatory_matcher,
        scaling="lin_cdf_match",
        metrics_calculators={
            (3, 2):
            metrics_calculators.BasicMetrics(other_name="k1").calc_metrics
        },
        masking_datasets=mds,
    )

    gpi_info = (1, 1, 1)
    ref_df = datasets["DS1"]["class"].read(1)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        new_ref_df = process.mask_dataset(ref_df, gpi_info)
    assert len(new_ref_df) == 0
    nptest.assert_allclose(new_ref_df.x.values, np.arange(1000, 1000))
    jobs = process.get_processing_jobs()
    for job in jobs:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            results = process.calc(*job)
        tst = []
        assert sorted(list(results)) == sorted(list(tst))
        for key, tst_key in zip(sorted(results), sorted(tst)):
            nptest.assert_almost_equal(results[key]["n_obs"],
                                       tst[tst_key]["n_obs"])
Example #16
def test_ascat_ismn_validation():
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    ascat_data_folder = os.path.join(os.path.dirname(__file__), 'test-data',
                                     'sat', 'ascat', 'netcdf', '55R22')

    ascat_grid_folder = os.path.join(os.path.dirname(__file__), 'test-data',
                                     'sat', 'ascat', 'netcdf', 'grid')

    ascat_reader = AscatH25_SSM(ascat_data_folder, ascat_grid_folder)
    ascat_reader.read_bulk = True
    ascat_reader._load_grid_info()

    # Initialize ISMN reader

    ismn_data_folder = os.path.join(os.path.dirname(__file__), 'test-data',
                                    'ismn', 'multinetwork', 'header_values')
    ismn_reader = ISMN_Interface(ismn_data_folder)

    jobs = []

    ids = ismn_reader.get_dataset_ids(
        variable='soil moisture',
        min_depth=0,
        max_depth=0.1)
    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        jobs.append((idx, metadata['longitude'], metadata['latitude']))

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        'ISMN': {
            'class': ismn_reader, 'columns': [
                'soil moisture'
            ], 'type': 'reference', 'args': [], 'kwargs': {}
        },
        'ASCAT': {
            'class': ascat_reader, 'columns': [
                'sm'
            ], 'type': 'other', 'args': [], 'kwargs': {}, 'grids_compatible':
            False, 'use_lut': False, 'lut_max_dist': 30000
        }
    }

    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    process = Validation(
        datasets=datasets,
        data_prep=DataPreparation(),
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0,
            reverse=True),
        scaling='lin_cdf_match',
        scale_to_other=True,
        metrics_calculator=metrics_calculators.BasicMetrics(),
        period=period,
        cell_based_jobs=False)

    for job in jobs:
        results = process.calc(job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(
        save_path, 'ISMN.soil moisture_with_ASCAT.sm.nc')

    vars_should = [u'n_obs', u'tau', u'gpi', u'RMSD', u'lon', u'p_tau',
                   u'BIAS', u'p_rho', u'rho', u'lat', u'R', u'p_R']
    n_obs_should = [360, 385, 1644, 1881, 1927, 479, 140, 251]
    rho_should = np.array([0.54618734, 0.71739876, 0.62089276, 0.53246528,
                           0.30299741, 0.69647062, 0.840593, 0.73913699],
                          dtype=np.float32)

    rmsd_should = np.array([11.53626347, 7.54565048, 17.45193481, 21.19371414,
                            14.24668026, 14.27493, 13.173215, 12.59192371],
                           dtype=np.float32)
    with nc.Dataset(results_fname) as results:
        assert sorted(results.variables.keys()) == sorted(vars_should)
        assert sorted(results.variables['n_obs'][:].tolist()) == sorted(
            n_obs_should)
        nptest.assert_allclose(sorted(rho_should),
                               sorted(results.variables['rho'][:]))
        nptest.assert_allclose(sorted(rmsd_should),
                               sorted(results.variables['RMSD'][:]))
Example #17
def test_validation_n3_k2_masking():
    # test result for one gpi in a cell
    tst_results_one = {
        (("DS1", "x"), ("DS3", "y")): {
            "n_obs": np.array([250], dtype=np.int32)
        },
        (("DS1", "x"), ("DS2", "y")): {
            "n_obs": np.array([250], dtype=np.int32)
        },
        (("DS1", "x"), ("DS3", "x")): {
            "n_obs": np.array([250], dtype=np.int32)
        },
        (("DS2", "y"), ("DS3", "x")): {
            "n_obs": np.array([250], dtype=np.int32)
        },
        (("DS2", "y"), ("DS3", "y")): {
            "n_obs": np.array([250], dtype=np.int32)
        },
    }

    # test result for two gpis in a cell
    tst_results_two = {
        (("DS1", "x"), ("DS3", "y")): {
            "n_obs": np.array([250, 250], dtype=np.int32)
        },
        (("DS1", "x"), ("DS2", "y")): {
            "n_obs": np.array([250, 250], dtype=np.int32)
        },
        (("DS1", "x"), ("DS3", "x")): {
            "n_obs": np.array([250, 250], dtype=np.int32)
        },
        (("DS2", "y"), ("DS3", "x")): {
            "n_obs": np.array([250, 250], dtype=np.int32)
        },
        (("DS2", "y"), ("DS3", "y")): {
            "n_obs": np.array([250, 250], dtype=np.int32)
        },
    }

    # cell 4 in this example has two gpis so it returns different results.
    tst_results = {1: tst_results_one, 2: tst_results_two}

    datasets = setup_TestDatasets()

    # setup masking datasets

    grid = grids.CellGrid(
        np.array([1, 2, 3, 4]),
        np.array([1, 2, 3, 4]),
        np.array([4, 4, 2, 1]),
        gpis=np.array([1, 2, 3, 4]),
    )

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        "masking1": {
            "class": mds1,
            "columns": ["x"],
            "args": [],
            "kwargs": {
                "limit": 500
            },
            "use_lut": False,
            "grids_compatible": True,
        },
        "masking2": {
            "class": mds2,
            "columns": ["x"],
            "args": [],
            "kwargs": {
                "limit": 750
            },
            "use_lut": False,
            "grids_compatible": True,
        },
    }

    process = Validation(
        datasets,
        "DS1",
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0).combinatory_matcher,
        scaling="lin_cdf_match",
        metrics_calculators={
            (3, 2):
            metrics_calculators.BasicMetrics(other_name="k1").calc_metrics
        },
        masking_datasets=mds,
    )

    gpi_info = (1, 1, 1)
    ref_df = datasets["DS1"]["class"].read(1)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=DeprecationWarning
                              )  # read_ts is hard coded when using mask_data
        new_ref_df = process.mask_dataset(ref_df, gpi_info)
    assert len(new_ref_df) == 250
    nptest.assert_allclose(new_ref_df.x.values, np.arange(750, 1000))
    jobs = process.get_processing_jobs()
    for job in jobs:

        with warnings.catch_warnings():
            # most warnings here are caused by the read_ts function that cannot
            # be changed when using a masking data set
            warnings.simplefilter("ignore", category=DeprecationWarning)
            results = process.calc(*job)

        tst = tst_results[len(job[0])]
        assert sorted(list(results)) == sorted(list(tst))
        for key, tst_key in zip(sorted(results), sorted(tst)):
            nptest.assert_almost_equal(results[key]["n_obs"],
                                       tst[tst_key]["n_obs"])
Example #18
                      'kwargs': {'mask_frozen_prob': 80,
                                 'mask_snow_prob': 80,
                                 'mask_ssf': True}}
           }


# The datasets dictionary contains all the information about the datasets to read. The `class` entry is the dataset reader instance we have already initialized. The `columns` key lists the columns of the dataset that are relevant for the validation; it is a mandatory field that tells the framework which other columns to ignore. In this case the columns `soil moisture_flag` and `soil moisture_orig_flag` will be ignored by the ISMN reader. We can also specify additional keyword arguments that are passed to the `read_ts` method of the dataset reader. Here we want the ASCAT reader to mask the ASCAT soil moisture using the included frozen and snow probabilities as well as the SSF. Other keys can be used here as well; please see the documentation for details.
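# 
# For reference, a minimal sketch of such a `datasets` dictionary (assuming the
# readers `ismn_reader` and `ascat_reader` have already been initialized, as in
# the other examples in this collection) could look like this:
# 
# ```python
# datasets = {
#     'ISMN': {
#         'class': ismn_reader,
#         'columns': ['soil moisture']},
#     'ASCAT': {
#         'class': ascat_reader,
#         'columns': ['sm'],
#         'kwargs': {'mask_frozen_prob': 80,
#                    'mask_snow_prob': 80,
#                    'mask_ssf': True}}}
# ```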

# In[13]:

period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]
basic_metrics = metrics_calculators.BasicMetrics(other_name='k1')

process = Validation(
    datasets, 'ISMN', {(2, 2): basic_metrics.calc_metrics},
    temporal_ref='ASCAT',
    scaling='lin_cdf_match',
    scaling_ref='ASCAT',   
    period=period)


# During the initialization of the Validation class we can also configure how the validation is run. In this case it uses the datasets we have specified earlier. The spatial reference is the `'ISMN'` dataset, which is the second argument. The third argument looks a little strange, so let's look at it in more detail.
# 
# It is a dictionary with a tuple as the key and a function as the value. The key tuple `(n, k)` has the following meaning: `n` datasets are temporally matched together and then given in sets of `k` columns to the metric calculator. The metric calculator then gets a DataFrame with the columns ['ref', 'k1', 'k2' ...] and so on depending on the value of k. The value of `(2, 2)` makes sense here since we only have two datasets and all our metrics also take two inputs. 
# 
# In more complex scenarios this can be used, for example, to temporally match three input datasets together and then hand combinations of two datasets to one metric calculator while all three datasets go to another metric calculator. This could look like this:
# 
# ```python
# { (3 ,2): metric_calc,
#   (3, 3): triple_collocation}
# ```
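# 
# As a hedged sketch (not part of this notebook), such a dictionary would be
# passed to the Validation class in the same way as the `(2, 2)` calculator
# above, assuming `datasets` now held three input datasets and `metric_calc`
# and `triple_collocation` were initialized metric calculation functions:
# 
# ```python
# process = Validation(
#     datasets, 'ISMN',
#     {(3, 2): metric_calc,
#      (3, 3): triple_collocation},
#     temporal_ref='ASCAT',
#     scaling='lin_cdf_match',
#     scaling_ref='ASCAT',
#     period=period)
# ```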
Example #19
def test_validation_n3_k2_masking_no_data_remains():

    datasets = setup_TestDatasets()

    # setup masking datasets

    grid = grids.CellGrid(np.array([1, 2, 3, 4]),
                          np.array([1, 2, 3, 4]),
                          np.array([4, 4, 2, 1]),
                          gpis=np.array([1, 2, 3, 4]))

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        'masking1': {
            'class': mds1,
            'columns': ['x'],
            'args': [],
            'kwargs': {
                'limit': 500
            },
            'use_lut': False,
            'grids_compatible': True
        },
        'masking2': {
            'class': mds2,
            'columns': ['x'],
            'args': [],
            'kwargs': {
                'limit': 1000
            },
            'use_lut': False,
            'grids_compatible': True
        }
    }

    process = Validation(
        datasets,
        'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (3, 2):
            metrics_calculators.BasicMetrics(other_name='k1').calc_metrics
        },
        masking_datasets=mds)

    gpi_info = (1, 1, 1)
    ref_df = datasets['DS1']['class'].read(1)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DeprecationWarning)
        new_ref_df = process.mask_dataset(ref_df, gpi_info)
    assert len(new_ref_df) == 0
    nptest.assert_allclose(new_ref_df.x.values, np.arange(1000, 1000))
    jobs = process.get_processing_jobs()
    for job in jobs:
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=DeprecationWarning)
            results = process.calc(*job)
        tst = []
        assert sorted(list(results)) == sorted(list(tst))
        for key, tst_key in zip(sorted(results), sorted(tst)):
            nptest.assert_almost_equal(results[key]['n_obs'],
                                       tst[tst_key]['n_obs'])
Example #20
def test_ascat_ismn_validation_metadata_rolling(ascat_reader, ismn_reader):
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    jobs = []

    ids = ismn_reader.get_dataset_ids(variable="soil moisture",
                                      min_depth=0,
                                      max_depth=0.1)

    metadata_dict_template = {
        "network": np.array(["None"], dtype="U256"),
        "station": np.array(["None"], dtype="U256"),
        "landcover": np.float32([np.nan]),
        "climate": np.array(["None"], dtype="U4"),
    }

    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        metadata_dict = [{
            "network": metadata["network"],
            "station": metadata["station"],
            "landcover": metadata["landcover_2010"],
            "climate": metadata["climate"],
        }]
        jobs.append(
            (idx, metadata["longitude"], metadata["latitude"], metadata_dict))

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        "ISMN": {
            "class": ismn_reader,
            "columns": ["soil moisture"]
        },
        "ASCAT": {
            "class": ascat_reader,
            "columns": ["sm"],
            "kwargs": {
                "mask_frozen_prob": 80,
                "mask_snow_prob": 80,
                "mask_ssf": True,
            },
        },
    }

    read_ts_names = {"ASCAT": "read", "ISMN": "read_ts"}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(datasets,
                           "ISMN",
                           period,
                           read_ts_names=read_ts_names)

    process = Validation(
        datasets,
        "ISMN",
        temporal_ref="ASCAT",
        scaling="lin_cdf_match",
        scaling_ref="ASCAT",
        metrics_calculators={
            (2, 2):
            metrics_calculators.RollingMetrics(
                other_name="k1",
                metadata_template=metadata_dict_template).calc_metrics
        },
        period=period,
    )

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results,
                               save_path,
                               ts_vars=["R", "p_R", "RMSD"])

    results_fname = os.path.join(save_path,
                                 "ASCAT.sm_with_ISMN.soil moisture.nc")

    target_vars = {
        "network":
        np.array(
            [
                "MAQU",
                "MAQU",
                "SCAN",
                "SCAN",
                "SCAN",
                "SOILSCAPE",
                "SOILSCAPE",
                "SOILSCAPE",
            ],
            dtype="U256",
        )
    }
    vars_should = [
        u"gpi", u"RMSD", u"lon", u"lat", u"R", u"p_R", u"time", u"idx",
        u"_row_size"
    ]
    for key, value in metadata_dict_template.items():
        vars_should.append(key)

    check_results(filename=results_fname,
                  target_vars=target_vars,
                  variables=vars_should)

    reader = PointDataResults(results_fname, read_only=True)
    df = reader.read_loc(None)
    assert np.all(df.gpi.values == np.arange(8))
    assert reader.read_ts(0).index.size == 357
    assert np.all(
        reader.read_ts(1).columns.values == np.array(["R", "p_R", "RMSD"]))
Example #21
def test_validation_n3_k2_masking():

    # test result for one gpi in a cell
    tst_results_one = {
        (('DS1', 'x'), ('DS3', 'y')): {
            'n_obs': np.array([250], dtype=np.int32)
        },
        (('DS1', 'x'), ('DS2', 'y')): {
            'n_obs': np.array([250], dtype=np.int32)
        },
        (('DS1', 'x'), ('DS3', 'x')): {
            'n_obs': np.array([250], dtype=np.int32)
        },
        (('DS2', 'y'), ('DS3', 'x')): {
            'n_obs': np.array([250], dtype=np.int32)
        },
        (('DS2', 'y'), ('DS3', 'y')): {
            'n_obs': np.array([250], dtype=np.int32)
        }
    }

    # test result for two gpis in a cell
    tst_results_two = {
        (('DS1', 'x'), ('DS3', 'y')): {
            'n_obs': np.array([250, 250], dtype=np.int32)
        },
        (('DS1', 'x'), ('DS2', 'y')): {
            'n_obs': np.array([250, 250], dtype=np.int32)
        },
        (('DS1', 'x'), ('DS3', 'x')): {
            'n_obs': np.array([250, 250], dtype=np.int32)
        },
        (('DS2', 'y'), ('DS3', 'x')): {
            'n_obs': np.array([250, 250], dtype=np.int32)
        },
        (('DS2', 'y'), ('DS3', 'y')): {
            'n_obs': np.array([250, 250], dtype=np.int32)
        }
    }

    # cell 4 in this example has two gpis so it returns different results.
    tst_results = {1: tst_results_one, 2: tst_results_two}

    datasets = setup_TestDatasets()

    # setup masking datasets

    grid = grids.CellGrid(np.array([1, 2, 3, 4]),
                          np.array([1, 2, 3, 4]),
                          np.array([4, 4, 2, 1]),
                          gpis=np.array([1, 2, 3, 4]))

    mds1 = GriddedTsBase("", grid, MaskingTestDataset)
    mds2 = GriddedTsBase("", grid, MaskingTestDataset)

    mds = {
        'masking1': {
            'class': mds1,
            'columns': ['x'],
            'args': [],
            'kwargs': {
                'limit': 500
            },
            'use_lut': False,
            'grids_compatible': True
        },
        'masking2': {
            'class': mds2,
            'columns': ['x'],
            'args': [],
            'kwargs': {
                'limit': 750
            },
            'use_lut': False,
            'grids_compatible': True
        }
    }

    process = Validation(
        datasets,
        'DS1',
        temporal_matcher=temporal_matchers.BasicTemporalMatching(
            window=1 / 24.0).combinatory_matcher,
        scaling='lin_cdf_match',
        metrics_calculators={
            (3, 2):
            metrics_calculators.BasicMetrics(other_name='k1').calc_metrics
        },
        masking_datasets=mds)

    gpi_info = (1, 1, 1)
    ref_df = datasets['DS1']['class'].read(1)
    with warnings.catch_warnings():
        # read_ts is hard coded when using mask_data
        warnings.simplefilter('ignore', category=DeprecationWarning)
        new_ref_df = process.mask_dataset(ref_df, gpi_info)
    assert len(new_ref_df) == 250
    nptest.assert_allclose(new_ref_df.x.values, np.arange(750, 1000))
    jobs = process.get_processing_jobs()
    for job in jobs:

        with warnings.catch_warnings():
            # most warnings here are caused by the read_ts function that cannot
            # be changed when using a masking data set
            warnings.simplefilter('ignore', category=DeprecationWarning)
            results = process.calc(*job)

        tst = tst_results[len(job[0])]
        assert sorted(list(results)) == sorted(list(tst))
        for key, tst_key in zip(sorted(results), sorted(tst)):
            nptest.assert_almost_equal(results[key]['n_obs'],
                                       tst[tst_key]['n_obs'])
Example #22
def test_ascat_ismn_validation(ascat_reader, ismn_reader):
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    jobs = []

    ids = ismn_reader.get_dataset_ids(variable="soil moisture",
                                      min_depth=0,
                                      max_depth=0.1)
    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        jobs.append((idx, metadata["longitude"], metadata["latitude"]))

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        "ISMN": {
            "class": ismn_reader,
            "columns": ["soil moisture"]
        },
        "ASCAT": {
            "class": ascat_reader,
            "columns": ["sm"],
            "kwargs": {
                "mask_frozen_prob": 80,
                "mask_snow_prob": 80,
                "mask_ssf": True,
            },
        },
    }

    read_ts_names = {"ASCAT": "read", "ISMN": "read_ts"}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(datasets,
                           "ISMN",
                           period,
                           read_ts_names=read_ts_names)

    process = Validation(
        datasets,
        "ISMN",
        temporal_ref="ASCAT",
        scaling="lin_cdf_match",
        scaling_ref="ASCAT",
        metrics_calculators={
            (2, 2):
            metrics_calculators.BasicMetrics(other_name="k1").calc_metrics
        },
        period=period,
    )

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(save_path,
                                 "ASCAT.sm_with_ISMN.soil moisture.nc")
    # targets
    target_vars = {
        "n_obs": [357, 384, 1646, 1875, 1915, 467, 141, 251],
        "rho":
        np.array([
            0.53934574, 0.7002289, 0.62200236, 0.53647155, 0.30413666,
            0.6740655, 0.8418981, 0.74206454
        ],
                 dtype=np.float32),
        "RMSD":
        np.array([
            11.583476, 7.729667, 17.441547, 21.125721, 14.31557, 14.187225,
            13.0622425, 12.903898
        ],
                 dtype=np.float32)
    }

    check_results(
        filename=results_fname,
        target_vars=target_vars,
    )
Example #23
def test_PairwiseIntercomparisonMetrics(testdata_generator):
    # This test first compares the PairwiseIntercomparisonMetrics to known
    # results and then confirms that it agrees with IntercomparisonMetrics as
    # expected

    datasets, expected = testdata_generator()

    # for the pairwise intercomparison metrics it's important that we use
    # make_combined_temporal_matcher
    val = Validation(
        datasets,
        "reference_name",
        scaling=None,  # doesn't work with the constant test data
        temporal_matcher=make_combined_temporal_matcher(pd.Timedelta(6, "H")),
        metrics_calculators={
            (4, 2): (
                PairwiseIntercomparisonMetrics(
                    calc_spearman=True, analytical_cis=False
                ).calc_metrics
            )
        }
    )
    results_pw = val.calc(
        [1], [1], [1], rename_cols=False, only_with_temporal_ref=True
    )

    # in results_pw, there are four entries with keys such as
    # (("c1name", "c1"), ("refname", "ref")), and so on.
    # Each value is a single dictionary with the values of the metrics

    expected_metrics = [
        "R", "p_R", "BIAS", "RMSD", "mse", "RSS", "mse_corr", "mse_bias",
        "urmsd", "mse_var", "n_obs", "gpi", "lat", "lon", "rho", "p_rho",
        "tau", "p_tau"
    ]
    for key in results_pw:
        assert isinstance(key, tuple)
        assert len(key) == 2
        assert all(map(lambda x: isinstance(x, tuple), key))
        assert isinstance(results_pw[key], dict)
        assert sorted(expected_metrics) == sorted(results_pw[key].keys())
        for m in expected_metrics:
            if m in expected[key]:
                assert_equal(results_pw[key][m], expected[key][m])

    # preparation of IntercomparisonMetrics run for comparison
    ds_names = list(datasets.keys())
    metrics = IntercomparisonMetrics(
        dataset_names=ds_names,
        # passing the names here explicitly, see GH issue #220
        refname="reference_name",
        other_names=ds_names[1:],
        calc_tau=True,
    )
    val = Validation(
        datasets,
        "reference_name",
        scaling=None,
        temporal_matcher=None,  # use default here
        metrics_calculators={(4, 4): metrics.calc_metrics}
    )

    print("running old setup")
    results = val.calc(1, 1, 1, rename_cols=False)

    # results is a dictionary with a single entry whose key is
    # (('c1name', 'c1'), ('c2name', 'c2'), ('c3name', 'c3'), ('refname',
    # 'ref')). The value is a dictionary with all the results, where each
    # metric name is joined via "_between_" to the combination of dataset
    # names, which are in turn joined with "_and_", e.g. for R between
    # ``refname`` and ``c1name`` the key is "R_between_refname_and_c1name"
    common_metrics = ["n_obs", "gpi", "lat", "lon"]
    pw_metrics = list(set(expected_metrics) - set(common_metrics))
    # there's some sorting done at some point in pytesmo
    oldkey = tuple(sorted([(name, name.split("_")[0]) for name in ds_names]))
    res_old = results[oldkey]
    for key in results_pw:
        res = results_pw[key]
        # handle the full dataset metrics
        for m in common_metrics:
            assert_equal(res[m], res_old[m])
        # now get the metrics and compare to the right combination
        for m in pw_metrics:
            othername = key[0][0]
            refname = key[1][0]
            if othername == "reference_name":
                # sorting might be different, see GH #220
                othername = key[1][0]
                refname = key[0][0]
            old_m_key = f"{m}_between_{refname}_and_{othername}"
            if m == "BIAS":
                # PairwiseIntercomparisonMetrics has the result as (other,
                # ref), and therefore "bias between other and ref", compared to
                # "bias between ref and bias" in IntercomparisonMetrics
                # this is related to issue #220
                assert_equal(np.abs(res[m]), np.abs(res_old[old_m_key]))
            elif m == "urmsd":
                # the old implementation differs from the new implementation
                pass
            else:
                assert_equal(res[m], res_old[old_m_key])
Example #24
def test_ascat_ismn_validation_metadata(ascat_reader, ismn_reader):
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    jobs = []

    ids = ismn_reader.get_dataset_ids(variable="soil moisture",
                                      min_depth=0,
                                      max_depth=0.1)

    metadata_dict_template = {
        "network": np.array(["None"], dtype="U256"),
        "station": np.array(["None"], dtype="U256"),
        "landcover": np.float32([np.nan]),
        "climate": np.array(["None"], dtype="U4"),
    }

    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        metadata_dict = [{
            "network": metadata["network"],
            "station": metadata["station"],
            "landcover": metadata["landcover_2010"],
            "climate": metadata["climate"],
        }]
        jobs.append(
            (idx, metadata["longitude"], metadata["latitude"], metadata_dict))

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        "ISMN": {
            "class": ismn_reader,
            "columns": ["soil moisture"],
        },
        "ASCAT": {
            "class": ascat_reader,
            "columns": ["sm"],
            "kwargs": {
                "mask_frozen_prob": 80,
                "mask_snow_prob": 80,
                "mask_ssf": True,
            },
        },
    }

    read_ts_names = {"ASCAT": "read", "ISMN": "read_ts"}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(datasets,
                           "ISMN",
                           period,
                           read_ts_names=read_ts_names)
    process = Validation(
        datasets,
        "ISMN",
        temporal_ref="ASCAT",
        scaling="lin_cdf_match",
        scaling_ref="ASCAT",
        metrics_calculators={
            (2, 2):
            metrics_calculators.BasicMetrics(
                other_name="k1",
                metadata_template=metadata_dict_template).calc_metrics
        },
        period=period,
    )

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(save_path,
                                 "ASCAT.sm_with_ISMN.soil moisture.nc")
    target_vars = {
        "n_obs": [357, 384, 1646, 1875, 1915, 467, 141, 251],
        "rho":
        np.array([
            0.53934574,
            0.7002289,
            0.62200236,
            0.53647155,
            0.30413666,
            0.6740655,
            0.8418981,
            0.74206454,
        ],
                 dtype=np.float32),
        "RMSD":
        np.array([
            11.583476,
            7.729667,
            17.441547,
            21.125721,
            14.31557,
            14.187225,
            13.0622425,
            12.903898,
        ],
                 dtype=np.float32),
        "network":
        np.array(
            [
                "MAQU",
                "MAQU",
                "SCAN",
                "SCAN",
                "SCAN",
                "SOILSCAPE",
                "SOILSCAPE",
                "SOILSCAPE",
            ],
            dtype="U256",
        )
    }
    vars_should = [
        'BIAS', 'R', 'RMSD', '_row_size', 'climate', 'gpi', 'idx', 'landcover',
        'lat', 'lon', 'n_obs', 'network', 'p_R', 'p_rho', 'p_tau', 'rho',
        'station', 'tau', 'time'
    ]

    check_results(filename=results_fname,
                  target_vars=target_vars,
                  variables=vars_should)
Example #25
def test_TripleCollocationMetrics(testdata_generator):
    # tests by comparison of pairwise metrics to triplet metrics

    datasets, expected = testdata_generator()

    refname = "reference_name"
    othernames = list(datasets.keys())
    othernames.remove(refname)

    triplet_metrics_calculator = TripleCollocationMetrics(
        refname, bootstrap_cis=False
    )

    matcher = make_combined_temporal_matcher(pd.Timedelta(6, "H"))

    val_triplet = Validation(
        datasets,
        "reference_name",
        scaling=None,  # doesn't work with the constant test data
        temporal_matcher=matcher,
        metrics_calculators={
            (4, 3): triplet_metrics_calculator.calc_metrics
        }
    )
    results_triplet = val_triplet.calc(
        [1], [1], [1], rename_cols=False, only_with_temporal_ref=True
    )

    if "col1_name" in datasets.keys():
        # we only test the TCA results with the random data, since for the
        # constant data all covariances are zero and TCA therefore doesn't
        # work.
        for metric in ["snr", "err_std", "beta"]:
            for dset in datasets:
                values = []
                dkey = (dset, datasets[dset]["columns"][0])
                for tkey in results_triplet:
                    if dkey in tkey:
                        values.append(results_triplet[tkey][(metric, dset)][0])
                diff = np.abs(np.diff(values))
                assert diff.max() / values[0] < 0.1

    # check if writing to file works
    results_path = Path("__test_results")
    # if this throws, there's either some data left over from previous tests,
    # or some data is named __test_results. Remove the __test_results directory
    # from your current directory to make the test work again.
    assert not results_path.exists()
    results_path.mkdir(exist_ok=True, parents=True)
    netcdf_results_manager(results_triplet, results_path.name)
    assert results_path.exists()
    for key in results_triplet:
        fname = "_with_".join(map(lambda t: ".".join(t), key)) + ".nc"
        assert (results_path / fname).exists()
        # res = xr.open_dataset(results_path / fname)
        # for metric in ["snr", "err_std", "beta"]:
        #     for dset, _ in key:
        #         mkey = metric + "__" + dset
        #         assert mkey in res.data_vars
    shutil.rmtree(results_path)

    # now with CIs, again only for random data
    if "col1_name" in datasets.keys():
        triplet_metrics_calculator = TripleCollocationMetrics(
            refname, bootstrap_cis=True
        )
        val_triplet = Validation(
            datasets,
            "reference_name",
            scaling=None,  # doesn't work with the constant test data
            temporal_matcher=matcher,
            metrics_calculators={
                (4, 3): triplet_metrics_calculator.calc_metrics
            }
        )
        results_triplet = val_triplet.calc(
            [1], [1], [1], rename_cols=False, only_with_temporal_ref=True
        )
        for key in results_triplet:
            for dset, _ in key:
                for metric in ["snr", "err_std", "beta"]:
                    lkey = f"{metric}_ci_lower"
                    ukey = f"{metric}_ci_upper"
                    assert (lkey, dset) in results_triplet[key]
                    assert (ukey, dset) in results_triplet[key]
                    assert (
                        results_triplet[key][(lkey, dset)]
                        <= results_triplet[key][(metric, dset)]
                    )
                    assert (
                        results_triplet[key][(metric, dset)]
                        <= results_triplet[key][(ukey, dset)]
                    )
Example #26
def test_validation_with_averager(ascat_reader, ismn_reader):
    """
    Test processing framework with the averaging module. ASCAT and ISMN data
    are used here without geographical considerations (the lut is provided
    upstream and already contains this information).
    """
    while hasattr(ascat_reader, 'cls'):
        ascat_reader = ascat_reader.cls
    # lookup table between the ascat and ismn points - not geographically correct
    upscaling_lut = {
        "ISMN": {
            1814367: [(0, 102.1333, 33.8833), (1, 102.1333, 33.6666)],
            1803695: [(2, -86.55, 34.783), (3, -97.083, 37.133),
                      (4, -105.417, 34.25)],
            1856312: [(5, -120.9675, 38.43003), (6, -120.78559, 38.14956),
                      (7, -120.80639, 38.17353)]
        }
    }
    gpis = (1814367, 1803695, 1856312)
    lons, lats = [], []
    for gpi in gpis:
        lon, lat = ascat_reader.grid.gpi2lonlat(gpi)
        lons.append(lon)
        lats.append(lat)

    jobs = [(gpis, lons, lats)]

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        "ASCAT": {
            "class": ascat_reader,
            "columns": ["sm"],
            "kwargs": {
                "mask_frozen_prob": 80,
                "mask_snow_prob": 80,
                "mask_ssf": True,
            }
        },
        "ISMN": {
            "class": ismn_reader,
            "columns": ["soil moisture"],
        },
    }

    read_ts_names = {"ASCAT": "read", "ISMN": "read_ts"}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(
        datasets,
        "ASCAT",
        period,
        read_ts_names=read_ts_names,
        upscale_parms={
            "upscaling_method": "average",
            "temporal_stability": True,
            "upscaling_lut": upscaling_lut,
        },
    )
    process = Validation(
        datasets,
        "ASCAT",
        temporal_ref="ISMN",
        scaling="lin_cdf_match",
        scaling_ref="ISMN",
        metrics_calculators={
            (2, 2):
            metrics_calculators.BasicMetrics(other_name="k1").calc_metrics
        },
        period=period,
    )

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(save_path,
                                 "ASCAT.sm_with_ISMN.soil moisture.nc")

    target_vars = {
        "n_obs": [764, 2392, 904],
        "rho": np.array([-0.012487, 0.255156, 0.635517], dtype=np.float32),
        "RMSD": np.array([0.056428, 0.056508, 0.116294], dtype=np.float32),
        "R": np.array([-0.012335, 0.257671, 0.657239], dtype=np.float32)
    }

    check_results(
        filename=results_fname,
        target_vars=target_vars,
    )
Example #27
def test_ascat_ismn_validation(ascat_reader):
    """
    Test processing framework with some ISMN and ASCAT sample data
    """

    # Initialize ISMN reader

    ismn_data_folder = os.path.join(
        os.path.dirname(__file__),
        "..",
        "test-data",
        "ismn",
        "multinetwork",
        "header_values",
    )
    ismn_reader = ISMN_Interface(ismn_data_folder)

    jobs = []

    ids = ismn_reader.get_dataset_ids(
        variable="soil moisture", min_depth=0, max_depth=0.1
    )
    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        jobs.append((idx, metadata["longitude"], metadata["latitude"]))

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        "ISMN": {"class": ismn_reader, "columns": ["soil moisture"]},
        "ASCAT": {
            "class": ascat_reader,
            "columns": ["sm"],
            "kwargs": {
                "mask_frozen_prob": 80,
                "mask_snow_prob": 80,
                "mask_ssf": True,
            },
        },
    }

    read_ts_names = {"ASCAT": "read", "ISMN": "read_ts"}
    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    datasets = DataManager(
        datasets, "ISMN", period, read_ts_names=read_ts_names
    )

    process = Validation(
        datasets,
        "ISMN",
        temporal_ref="ASCAT",
        scaling="lin_cdf_match",
        scaling_ref="ASCAT",
        metrics_calculators={
            (2, 2): metrics_calculators.BasicMetrics(
                other_name="k1"
            ).calc_metrics
        },
        period=period,
    )

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(
        save_path, "ASCAT.sm_with_ISMN.soil moisture.nc"
    )

    vars_should = [
        u"n_obs",
        u"tau",
        u"gpi",
        u"RMSD",
        u"lon",
        u"p_tau",
        u"BIAS",
        u"p_rho",
        u"rho",
        u"lat",
        u"R",
        u"p_R",
        u"time",
        u"idx",
        u"_row_size",
    ]
    n_obs_should = [357, 384, 1646, 1875, 1915, 467, 141, 251]
    rho_should = np.array(
        [
            0.53934574,
            0.7002289,
            0.62200236,
            0.53647155,
            0.30413666,
            0.6740655,
            0.8418981,
            0.74206454,
        ],
        dtype=np.float32,
    )
    rmsd_should = np.array(
        [
            11.583476,
            7.729667,
            17.441547,
            21.125721,
            14.31557,
            14.187225,
            13.0622425,
            12.903898,
        ],
        dtype=np.float32,
    )

    with nc.Dataset(results_fname, mode="r") as results:
        vars = results.variables.keys()
        n_obs = results.variables["n_obs"][:].tolist()
        rho = results.variables["rho"][:]
        rmsd = results.variables["RMSD"][:]

    assert sorted(vars) == sorted(vars_should)
    assert sorted(n_obs) == sorted(n_obs_should)
    nptest.assert_allclose(sorted(rho), sorted(rho_should), rtol=1e-4)
    nptest.assert_allclose(sorted(rmsd), sorted(rmsd_should), rtol=1e-4)
Example #28
def test_ascat_ismn_validation():
    """
    Test processing framework with some ISMN and ASCAT sample data
    """
    ascat_data_folder = os.path.join(os.path.dirname(__file__), '..', 'test-data',
                                     'sat', 'ascat', 'netcdf', '55R22')

    ascat_grid_folder = os.path.join(os.path.dirname(__file__), '..', 'test-data',
                                     'sat', 'ascat', 'netcdf', 'grid')

    static_layers_folder = os.path.join(os.path.dirname(__file__),
                                        '..', 'test-data', 'sat',
                                        'h_saf', 'static_layer')

    ascat_reader = AscatSsmCdr(ascat_data_folder, ascat_grid_folder,
                               grid_filename='TUW_WARP5_grid_info_2_1.nc',
                               static_layer_path=static_layers_folder)
    ascat_reader.read_bulk = True

    # Initialize ISMN reader

    ismn_data_folder = os.path.join(os.path.dirname(__file__), '..', 'test-data',
                                    'ismn', 'multinetwork', 'header_values')
    ismn_reader = ISMN_Interface(ismn_data_folder)

    jobs = []

    ids = ismn_reader.get_dataset_ids(
        variable='soil moisture',
        min_depth=0,
        max_depth=0.1)
    for idx in ids:
        metadata = ismn_reader.metadata[idx]
        jobs.append((idx, metadata['longitude'], metadata['latitude']))

    # Create the variable ***save_path*** which is a string representing the
    # path where the results will be saved. **DO NOT CHANGE** the name
    # ***save_path*** because it will be searched during the parallel
    # processing!

    save_path = tempfile.mkdtemp()

    # Create the validation object.

    datasets = {
        'ISMN': {
            'class': ismn_reader,
            'columns': ['soil moisture']
        },
        'ASCAT': {
            'class': ascat_reader,
            'columns': ['sm'],
            'kwargs': {'mask_frozen_prob': 80,
                       'mask_snow_prob': 80,
                       'mask_ssf': True}
        }}

    period = [datetime(2007, 1, 1), datetime(2014, 12, 31)]

    process = Validation(
        datasets, 'ISMN',
        temporal_ref='ASCAT',
        scaling='lin_cdf_match',
        scaling_ref='ASCAT',
        metrics_calculators={
            (2, 2): metrics_calculators.BasicMetrics(other_name='k1').calc_metrics},
        period=period)

    for job in jobs:
        results = process.calc(*job)
        netcdf_results_manager(results, save_path)

    results_fname = os.path.join(
        save_path, 'ASCAT.sm_with_ISMN.soil moisture.nc')

    vars_should = [u'n_obs', u'tau', u'gpi', u'RMSD', u'lon', u'p_tau',
                   u'BIAS', u'p_rho', u'rho', u'lat', u'R', u'p_R']
    n_obs_should = [384,  357,  482,  141,  251, 1927, 1887, 1652]
    rho_should = np.array([0.70022893, 0.53934574,
                           0.69356072, 0.84189808,
                           0.74206454, 0.30299741,
                           0.53143877, 0.62204134], dtype=np.float32)

    rmsd_should = np.array([7.72966719, 11.58347607,
                            14.57700157, 13.06224251,
                            12.90389824, 14.24668026,
                            21.19682884, 17.3883934], dtype=np.float32)
    with nc.Dataset(results_fname, mode='r') as results:
        assert sorted(results.variables.keys()) == sorted(vars_should)
        assert sorted(results.variables['n_obs'][:].tolist()) == sorted(
            n_obs_should)
        nptest.assert_allclose(sorted(rho_should),
                               sorted(results.variables['rho'][:]),
                               rtol=1e-4)
        nptest.assert_allclose(sorted(rmsd_should),
                               sorted(results.variables['RMSD'][:]),
                               rtol=1e-4)