Example #1
# Shared imports assumed by the examples below. LDAS_io, ISMN_io, HSAF_io,
# SMAP_io, SMOS_io, CCISM_io, MSWEP_io, Paths, calc_anomaly, calc_anom,
# lonlat2gpi, TCA, tc, bootstrap_tc, ecol, df_match, API, KF, EnKF, and MadKF
# are project-specific helpers assumed to be importable from the surrounding
# codebase.
import os
import logging
import platform
from copy import deepcopy
from pathlib import Path
from itertools import combinations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr


def Tb_evaluation():

    result_file = r'D:\work\LDAS\2018-06_rmse_uncertainty\Tb_evaluation\validation.csv'

    DA_const_err = LDAS_io('ObsFcstAna', 'US_M36_SMOS40_DA_cal_scaled')
    DA_varia_err = LDAS_io('ObsFcstAna', 'US_M36_SMOS40_DA_cal_scl_errfile')

    ismn = ISMN_io(col_offs=DA_const_err.grid.tilegrids.loc['domain',
                                                            'i_offg'],
                   row_offs=DA_const_err.grid.tilegrids.loc['domain',
                                                            'j_offg'])

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):

        logging.info('%i/%i' % (i, len(ismn.list)))

        res = pd.DataFrame(meta.copy()).transpose()
        col = meta.ease_col
        row = meta.ease_row

        for io, mode in zip([DA_const_err, DA_varia_err],
                            ['const_err', 'varia_err']):
            ubRMSD = np.sqrt(
                (((io.timeseries['obs_obs'][:, row, col, :] -
                   io.timeseries['obs_obs'][:, row, col, :].mean()) -
                  (io.timeseries['obs_fcst'][:, row, col, :] -
                   io.timeseries['obs_fcst'][:, row, col, :].mean()))**2
                 ).mean().values)
            ensstd = np.sqrt(io.timeseries['obs_anavar'][:, row,
                                                         col, :].mean()).values
            res['ubrmsd_' + mode] = ubRMSD
            res['ensstd_' + mode] = ensstd

        if not os.path.isfile(result_file):
            res.to_csv(result_file, float_format='%0.4f')
        else:
            res.to_csv(result_file,
                       float_format='%0.4f',
                       mode='a',
                       header=False)
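
The ubRMSD computed inside the loop is the unbiased RMSD between the assimilated observations (obs_obs) and the model forecasts (obs_fcst). A minimal standalone sketch of the metric (ubrmsd is a hypothetical helper, not part of the original code):

import numpy as np

def ubrmsd(obs, fcst):
    # ubRMSD: RMSD after subtracting the mean of each series, i.e.
    # sqrt(mean(((obs - mean(obs)) - (fcst - mean(fcst)))**2))
    obs, fcst = np.asarray(obs, float), np.asarray(fcst, float)
    return np.sqrt(np.mean(((obs - obs.mean()) - (fcst - fcst.mean()))**2))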
Example #2
def plot_cat_timeseries():

    outpath = r'D:\work\LDAS\2018-02_scaling\_new\ismn_eval\timeseries'

    fname = r"D:\work\LDAS\2018-02_scaling\_new\ismn_eval\validation.csv"
    res = pd.read_csv(fname)

    diff_srf = res['corr_DA_cal_pent_ma_sm_surface'] - res[
        'corr_DA_uncal_pent_ma_sm_surface']
    diff_rz = res['corr_DA_cal_pent_ma_sm_rootzone'] - res[
        'corr_DA_uncal_pent_ma_sm_rootzone']
    diff_prof = res['corr_DA_cal_pent_ma_sm_profile'] - res[
        'corr_DA_uncal_pent_ma_sm_profile']
    ind = (diff_srf > 0.2) | (diff_rz > 0.2) | (diff_prof > 0.2)
    res = res.loc[ind, ['network', 'station', 'lat', 'lon']]

    ismn = ISMN_io()
    cal = LDAS_io('xhourly', 'US_M36_SMOS_DA_calibrated_scaled')
    uncal = LDAS_io('xhourly', 'US_M36_SMOS_DA_nocal_scaled_pentadal')

    variables = ['sm_surface', 'sm_rootzone', 'sm_profile']

    for idx, stat in res.iterrows():

        fname = os.path.join(outpath,
                             stat.network + '_' + stat.station + '.png')

        ts_ismn = ismn.read(stat.network, stat.station)
        lat = stat.lat
        lon = stat.lon

        plt.figure(figsize=(17, 9))

        for i, var in enumerate(variables):

            ax = plt.subplot(3, 1, i + 1)

            ts_cal = calc_anomaly(cal.read_ts(var, lon, lat), method='ma')
            ts_cal.index += pd.to_timedelta('2 hours')
            ts_uncal = calc_anomaly(uncal.read_ts(var, lon, lat), method='ma')
            ts_uncal.index += pd.to_timedelta('2 hours')

            df = pd.DataFrame({
                'cal': ts_cal,
                'uncal': ts_uncal,
                'insitu': calc_anomaly(ts_ismn[var], method='ma')
            }).dropna()
            if len(df) == 0:
                continue
            df.plot(ax=ax)

            title = 'R(ismn - cal) = %.2f , R(ismn - uncal) = %.2f' % (
                df.corr().loc['insitu', 'cal'], df.corr().loc['insitu',
                                                              'uncal'])

            ax.set_title(title, fontsize=12)
            ax.set_xlim('2010-01-01', '2016-01-01')
            ax.set_ylim(-0.3, 0.3)
            ax.set_xlabel('')

        plt.tight_layout()

        plt.savefig(fname, dpi=150)
        plt.close()
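
calc_anomaly(..., method='ma') removes the low-frequency signal before the series are compared, so the plots and correlations above reflect short-term anomalies. A rough sketch of a moving-average anomaly, assuming the helper subtracts a centered rolling mean (the window length and edge handling of the actual calc_anomaly may differ):

import pandas as pd

def moving_average_anomaly(ts, window=35):
    # Departure from a centered moving average; the 35-day window is an
    # assumption, not taken from the original helper.
    ts = ts.resample('1D').mean()
    clim = ts.rolling(window, center=True, min_periods=1).mean()
    return (ts - clim).dropna()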
Example #3
def run(part):

    parts = 6

    result_file = r'D:\work\ESA_CCI_SM\validation_%i.csv' % part

    cci = CCISM_io()
    ismn = ISMN_io()

    # ismn.list = ismn.list.iloc[100:120]

    # Split the station list into `parts` equal chunks for parallelization
    subs = (np.arange(parts + 1) * len(ismn.list) / parts).astype('int')
    subs[-1] = len(ismn.list)
    start = subs[part - 1]
    end = subs[part]
    ismn.list = ismn.list.iloc[start:end, :]

    periods = {
        'p1': ['2007-10-01', '2010-01-14'],
        'p2': ['2010-01-15', '2011-10-04'],
        'p3': ['2011-10-05', '2012-06-30'],
        'p4': ['2012-07-01', '2014-12-31']
    }

    freq = ['abs', 'anom']

    corr_tags = [
        'corr_' + m + '_' + v + '_' + p + '_' + f for m in cci.modes
        for v in cci.versions for p in periods.keys() for f in freq
    ]
    p_tags = [
        'p_' + m + '_' + v + '_' + p + '_' + f for m in cci.modes
        for v in cci.versions for p in periods.keys() for f in freq
    ]
    n_tags = [
        'n_' + m + '_' + v + '_' + p + '_' + f for m in cci.modes
        for v in cci.versions for p in periods.keys() for f in freq
    ]

    res = ismn.list.copy()
    res.drop(['ease_col', 'ease_row'], axis='columns', inplace=True)
    for col in corr_tags + p_tags:
        res[col] = np.nan
    for col in n_tags:
        res[col] = 0

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):
        print('%i/%i (Proc %i)' % (i, len(ismn.list), part))

        if ts_insitu is None:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue
        ts_insitu = ts_insitu[periods['p1'][0]:periods['p4'][1]]
        if len(ts_insitu) < 10:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue
        df_insitu = pd.DataFrame(ts_insitu).dropna()
        df_insitu_anom = pd.DataFrame(calc_anomaly(ts_insitu)).dropna()

        for m in cci.modes:
            df_cci = cci.read(meta.lon, meta.lat, mode=m).dropna()
            if len(df_cci) < 10:
                print('No CCI ' + m + ' data for ' + meta.network + ' / ' +
                      meta.station)
                continue

            for f in freq:
                if f == 'abs':
                    matched = df_match(df_cci, df_insitu, window=0.5)
                else:
                    for v in cci.versions:
                        df_cci.loc[:, m + '_' + v] = calc_anomaly(
                            df_cci[m + '_' + v])
                    df_cci.dropna(inplace=True)
                    if (len(df_cci) < 10) | (len(df_insitu_anom) < 10):
                        print('No in situ or CCI ' + m + ' anomaly data for ' +
                              meta.network + ' / ' + meta.station)
                        continue
                    matched = df_match(df_cci, df_insitu_anom, window=0.5)

                data = df_cci.join(matched['insitu']).dropna()

                for p in periods.keys():
                    vals = data[periods[p][0]:periods[p][1]].values

                    n_matches = vals.shape[0]
                    if n_matches < 10:
                        continue
                    for k, v in enumerate(cci.versions):
                        corr, p_value = pearsonr(vals[:, k], vals[:, -1])
                        res.loc[meta.name, 'corr_' + m + '_' + v + '_' + p +
                                '_' + f] = corr
                        res.loc[meta.name, 'p_' + m + '_' + v + '_' + p + '_' +
                                f] = p_value
                        res.loc[meta.name, 'n_' + m + '_' + v + '_' + p + '_' +
                                f] = n_matches

    res.to_csv(result_file, float_format='%0.4f')
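
run(part) evaluates one of `parts` equally sized slices of the ISMN station list, so several parts can run concurrently. A hedged sketch of a driver (the original dispatch code is not shown):

from multiprocessing import Pool

if __name__ == '__main__':
    parts = 6
    with Pool(parts) as pool:
        pool.map(run, range(1, parts + 1))  # `part` is 1-based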
Example #4

def EC_ascat_smap_ismn_ldas():

    result_file = Path('/Users/u0116961/Documents/work/extended_collocation/ec_ascat_smap_ismn_ldas.csv')

    names = ['insitu', 'ascat', 'smap', 'ol', 'da']
    combs = list(combinations(names, 2))

    ds_ol = LDAS_io('xhourly', 'US_M36_SMAP_TB_OL_noScl').timeseries
    ds_da = LDAS_io('xhourly', 'US_M36_SMAP_TB_MadKF_DA_it11').timeseries
    ds_da_ana = LDAS_io('ObsFcstAna', 'US_M36_SMAP_TB_MadKF_DA_it11').timeseries['obs_ana']
    tg = LDAS_io().grid.tilegrids

    modes = ['absolute','longterm','shortterm']

    ismn = ISMN_io()
    ismn.list = ismn.list.iloc[70::]
    ascat = HSAF_io()
    smap = SMAP_io()

    lut = pd.read_csv(Paths().lut, index_col=0)

    i = 0
    for meta, ts_insitu in ismn.iter_stations(surface_only=True):
        i += 1
        logging.info('%i/%i' % (i, len(ismn.list)))

        try:
            if len(ts_insitu := ts_insitu['2015-04-01':'2020-04-01'].resample('1d').mean().dropna()) < 25:
                continue
        except Exception:
            continue

        res = pd.DataFrame(meta.copy()).transpose()
        col = meta.ease_col
        row = meta.ease_row

        colg = col + tg.loc['domain', 'i_offg']  # col / lon
        rowg = row + tg.loc['domain', 'j_offg']  # row / lat

        tmp_lut = lut[(lut.ease2_col == colg) & (lut.ease2_row == rowg)]
        if len(tmp_lut) == 0:
            continue

        gpi_smap = tmp_lut.index.values[0]
        gpi_ascat = tmp_lut.ascat_gpi.values[0]

        try:
            ts_ascat = ascat.read(gpi_ascat, resample_time=False).resample('1d').mean().dropna()
            ts_ascat = ts_ascat[~ts_ascat.index.duplicated(keep='first')]
            ts_ascat.name = 'ASCAT'
        except Exception:
            continue

        ts_smap = smap.read(gpi_smap)

        if (ts_ascat is None) | (ts_smap is None):
            continue

        ind = (ds_ol['snow_mass'][:, row, col].values == 0)&(ds_ol['soil_temp_layer1'][:, row, col].values > 277.15)
        ts_ol = ds_ol['sm_surface'][:, row, col].to_series().loc[ind].dropna()
        ts_ol.index += pd.to_timedelta('2 hours')

        ind = (ds_da['snow_mass'][:, row, col].values == 0)&(ds_da['soil_temp_layer1'][:, row, col].values > 277.15)
        ts_da = ds_da['sm_surface'][:, row, col].to_series().loc[ind].dropna()
        ts_da.index += pd.to_timedelta('2 hours')

        for mode in modes:

            # Always derive each mode from the original series; the previous
            # version fed the anomalies of one mode into the next and
            # permanently overwrote ts_ol / ts_da.
            if mode == 'absolute':
                ts_ins = ts_insitu.copy()
                ts_asc = ts_ascat.copy()
                ts_smp = ts_smap.copy()
                ts_olm = ts_ol.copy()
                ts_dam = ts_da.copy()
            else:
                ts_ins = calc_anom(ts_insitu.copy(), longterm=(mode == 'longterm')).dropna()
                ts_asc = calc_anom(ts_ascat.copy(), longterm=(mode == 'longterm')).dropna()
                ts_smp = calc_anom(ts_smap.copy(), longterm=(mode == 'longterm')).dropna()
                ts_olm = calc_anom(ts_ol.copy(), longterm=(mode == 'longterm')).dropna()
                ts_dam = calc_anom(ts_da.copy(), longterm=(mode == 'longterm')).dropna()

            tmp = pd.DataFrame(dict(zip(names, [ts_ins, ts_asc, ts_smp, ts_olm, ts_dam]))).dropna()

            corr = tmp.corr()
            ec_res = ecol(tmp[['insitu', 'ascat', 'smap', 'ol', 'da']], correlated=[['smap', 'ol'], ['smap', 'da'], ['ol', 'da']])

            res[f'len_{mode}'] = len(tmp)
            for c in combs:
                # tag with the mode so later modes do not overwrite earlier ones
                res[f'corr_{"_".join(c)}_{mode}'] = corr.loc[c]
            res[f'err_corr_smap_ol_{mode}'] = ec_res['err_corr_smap_ol']
            res[f'err_corr_smap_da_{mode}'] = ec_res['err_corr_smap_da']
            res[f'err_corr_ol_da_{mode}'] = ec_res['err_corr_ol_da']

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.4f')
        else:
            res.to_csv(result_file, float_format='%0.4f', mode='a', header=False)
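
The create-or-append pattern used to write res recurs in nearly every example here. A small helper (hypothetical, not in the original code) that factors it out:

import os

def append_result(res, result_file):
    # Write the header only when the file is first created.
    if not os.path.isfile(str(result_file)):
        res.to_csv(result_file, float_format='%0.4f')
    else:
        res.to_csv(result_file, float_format='%0.4f', mode='a', header=False)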
Example #5
def TCA_insitu_evaluation():

    result_file = r'D:\work\LDAS\2018-06_rmse_uncertainty\TCA_evaluation\validation.csv'

    noDA = LDAS_io('xhourly', 'US_M36_SMOS40_noDA_cal_scaled')

    DA_const_err = LDAS_io('xhourly', 'US_M36_SMOS40_DA_cal_scaled')
    DA_varia_err = LDAS_io('xhourly', 'US_M36_SMOS40_DA_cal_scl_errfile')

    t_ana = pd.DatetimeIndex(
        LDAS_io('ObsFcstAna', 'US_M36_SMOS40_DA_cal_scaled').timeseries.time.
        values).sort_values()

    ascat = HSAF_io()
    gpi_list = pd.read_csv(
        r"D:\data_sets\ASCAT\warp5_grid\pointlist_warp_conus.csv", index_col=0)

    ismn = ISMN_io(col_offs=noDA.grid.tilegrids.loc['domain', 'i_offg'],
                   row_offs=noDA.grid.tilegrids.loc['domain', 'j_offg'])

    runs = ['noDA', 'DA_const_err', 'DA_varia_err']
    tss = [noDA.timeseries, DA_const_err.timeseries, DA_varia_err.timeseries]

    variables = ['sm_surface']
    modes = ['absolute']

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):
        logging.info('%i/%i' % (i, len(ismn.list)))

        try:

            res = pd.DataFrame(meta.copy()).transpose()
            col = meta.ease_col
            row = meta.ease_row

            gpi = lonlat2gpi(meta.lon, meta.lat, gpi_list)

            ts_asc = ascat.read(gpi, resample_time=False)
            if ts_asc is None:
                continue
            ts_asc.name = 'ascat'
            ts_asc = pd.DataFrame(ts_asc)

            for var in variables:
                for mode in modes:

                    ts_ins = ts_insitu[var].dropna()
                    ts_ins.name = 'insitu'
                    ts_ins = pd.DataFrame(ts_ins)

                    for run, ts_model in zip(runs, tss):

                        ind = (ts_model['snow_mass'][row, col].values == 0) & (
                            ts_model['soil_temp_layer1'][row,
                                                         col].values > 277.15)
                        ts_mod = ts_model[var][row, col].to_series().loc[ind]
                        ts_mod.index += pd.to_timedelta('2 hours')
                        ts_mod = ts_mod.loc[t_ana].dropna()
                        ts_mod.name = 'model'
                        ts_mod = pd.DataFrame(ts_mod)

                        matched = df_match(ts_mod, ts_asc, ts_ins, window=0.5)
                        data = (ts_mod.join(matched[0][['ascat']])
                                      .join(matched[1][['insitu']])
                                      .dropna())

                        tc_res = TCA(data['model'].values,
                                     data['ascat'].values,
                                     data['insitu'].values)

                        res['RMSE_model_' + run + '_' + mode + '_' +
                            var] = tc_res[1][0]
                        res['RMSE_ascat_' + run + '_' + mode + '_' +
                            var] = tc_res[1][1]
                        res['RMSE_insitu_' + run + '_' + mode + '_' +
                            var] = tc_res[1][2]

                        res['beta_ascat_' + run + '_' + mode + '_' +
                            var] = tc_res[2][1]
                        res['beta_insitu_' + run + '_' + mode + '_' +
                            var] = tc_res[2][2]

                        res['len_' + mode + '_' + var] = len(data)

            if not os.path.isfile(result_file):
                res.to_csv(result_file, float_format='%0.4f')
            else:
                res.to_csv(result_file,
                           float_format='%0.4f',
                           mode='a',
                           header=False)

        except Exception:
            continue
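
TCA estimates the random error of each of the three collocated series (model, ASCAT, in situ) under the assumption of mutually independent errors. A minimal covariance-based sketch of the classical triple collocation estimator (the project's TCA additionally returns the scaling factors beta used above; the exact return structure there is not shown):

import numpy as np

def tca_errors(x, y, z):
    # Classical triple collocation: error standard deviations from the
    # pairwise covariances, assuming zero error cross-correlation.
    c = np.cov(np.vstack((x, y, z)))
    err_x = np.sqrt(c[0, 0] - c[0, 1] * c[0, 2] / c[1, 2])
    err_y = np.sqrt(c[1, 1] - c[0, 1] * c[1, 2] / c[0, 2])
    err_z = np.sqrt(c[2, 2] - c[0, 2] * c[1, 2] / c[0, 1])
    return err_x, err_y, err_z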
Example #6
def insitu_evaluation():

    result_file = r'D:\work\LDAS\2018-06_rmse_uncertainty\insitu_evaluation\validation.csv'

    noDA = LDAS_io('xhourly', 'US_M36_SMOS40_noDA_cal_scaled')

    DA_const_err = LDAS_io('xhourly', 'US_M36_SMOS40_DA_cal_scaled')
    DA_varia_err = LDAS_io('xhourly', 'US_M36_SMOS40_DA_cal_scl_errfile')

    t_ana = pd.DatetimeIndex(
        LDAS_io('ObsFcstAna', 'US_M36_SMOS40_DA_cal_scaled').timeseries.time.
        values).sort_values()

    ismn = ISMN_io(col_offs=noDA.grid.tilegrids.loc['domain', 'i_offg'],
                   row_offs=noDA.grid.tilegrids.loc['domain', 'j_offg'])

    runs = ['noDA', 'DA_const_err', 'DA_varia_err']
    tss = [noDA.timeseries, DA_const_err.timeseries, DA_varia_err.timeseries]

    variables = ['sm_surface', 'sm_rootzone', 'sm_profile']
    # modes = ['absolute','longterm','shortterm']
    modes = ['absolute']

    # ismn.list = ismn.list.iloc[101::]

    i = 0
    for meta, ts_insitu in ismn.iter_stations():
        i += 1
        logging.info('%i/%i' % (i, len(ismn.list)))

        res = pd.DataFrame(meta.copy()).transpose()
        col = meta.ease_col
        row = meta.ease_row

        for var in variables:
            for mode in modes:

                if mode == 'absolute':
                    ts_ref = ts_insitu[var].dropna()
                elif mode == 'mean':
                    ts_ref = calc_anomaly(ts_insitu[var], mode).dropna()
                else:
                    ts_ref = calc_anomaly(
                        ts_insitu[var],
                        method='moving_average',
                        longterm=(mode == 'longterm')).dropna()

                for run, ts_model in zip(runs, tss):

                    ind = (ts_model['snow_mass'][row, col].values == 0) & (
                        ts_model['soil_temp_layer1'][row, col].values > 277.15)
                    ts_mod = ts_model[var][row, col].to_series().loc[ind]
                    ts_mod.index += pd.to_timedelta('2 hours')
                    # TODO: Make sure that time of netcdf file is correct!!

                    if mode == 'absolute':
                        ts_mod = ts_mod.dropna()
                    else:
                        ts_mod = calc_anomaly(
                            ts_mod,
                            method='moving_average',
                            longterm=mode == 'longterm').dropna()

                    tmp = pd.DataFrame({
                        1: ts_ref,
                        2: ts_mod
                    }).loc[t_ana, :].dropna()
                    res['len_' + mode + '_' + var] = len(tmp)

                    r, p = pearsonr(tmp[1], tmp[2])

                    res['corr_' + run + '_' + mode + '_' +
                        var] = r if (r > 0) & (p < 0.01) else np.nan
                    res['rmsd_' + run + '_' + mode + '_' + var] = np.sqrt(
                        ((tmp[1] - tmp[2])**2).mean())
                    res['ubrmsd_' + run + '_' + mode + '_' + var] = np.sqrt(
                        (((tmp[1] - tmp[1].mean()) -
                          (tmp[2] - tmp[2].mean()))**2).mean())

        if not os.path.isfile(result_file):
            res.to_csv(result_file, float_format='%0.4f')
        else:
            res.to_csv(result_file,
                       float_format='%0.4f',
                       mode='a',
                       header=False)
Example #7
def run_ismn_eval():

    experiments = [['SMOSSMAP', 'short']]

    names = ['open_loop', 'MadKF_SMOS40'] + ['_'.join(exp) for exp in experiments]
    runs = ['US_M36_SMAP_TB_OL_noScl', 'US_M36_SMOS40_TB_MadKF_DA_it613'] + \
           [f'US_M36_SMAP_TB_DA_scl_{name}' for name in names[2:]]

    dss = [LDAS_io('xhourly', run).timeseries for run in runs]

    result_file = Path(
        '/Users/u0116961/Documents/work/LDAS/2020-03_scaling/validation/ismn_eval.csv'
    )
    t_ana = pd.DatetimeIndex(
        LDAS_io('ObsFcstAna', runs[0]).timeseries.time.values).sort_values()

    variables = ['sm_surface', 'sm_rootzone', 'sm_profile']
    modes = ['absolute', 'longterm', 'shortterm']

    ismn = ISMN_io()
    ismn.list = ismn.list.iloc[70::]

    i = 0
    for meta, ts_insitu in ismn.iter_stations(surface_only=False):
        i += 1
        logging.info('%i/%i' % (i, len(ismn.list)))

        if len(ts_insitu := ts_insitu['2015-04-01':'2020-04-01']) < 50:
            continue

        res = pd.DataFrame(meta.copy()).transpose()
        col = meta.ease_col
        row = meta.ease_row

        for var in variables:
            for mode in modes:

                if mode == 'absolute':
                    ts_ref = ts_insitu[var].dropna()
                else:
                    ts_ref = calc_anom(ts_insitu[var],
                                       longterm=(mode == 'longterm')).dropna()

                for run, ts_model in zip(names, dss):

                    ind = (ts_model['snow_mass'][:, row, col].values == 0) & (
                        ts_model['soil_temp_layer1'][:, row,
                                                     col].values > 277.15)
                    ts_mod = ts_model[var][:, row, col].to_series().loc[ind]
                    ts_mod.index += pd.to_timedelta('2 hours')

                    if mode == 'absolute':
                        ts_mod = ts_mod.dropna()
                    else:
                        ts_mod = calc_anom(
                            ts_mod, longterm=mode == 'longterm').dropna()

                    tmp = pd.DataFrame({1: ts_ref, 2: ts_mod}).dropna()
                    res['len_' + mode + '_' + var] = len(tmp)
                    r, p = pearsonr(
                        tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                    res['r_' + run + '_' + mode + '_' + var] = r
                    # res['p_' + run +'_' + mode + '_' + var] = p
                    # res['rmsd_' + run +'_' + mode + '_' + var] = np.sqrt(((tmp[1]-tmp[2])**2).mean())
                    res['ubrmsd_' + run + '_' + mode + '_' + var] = np.sqrt(
                        (((tmp[1] - tmp[1].mean()) -
                          (tmp[2] - tmp[2].mean()))**2).mean())

                    tmp = pd.DataFrame({
                        1: ts_ref,
                        2: ts_mod
                    }).reindex(t_ana).dropna()
                    res['ana_len_' + mode + '_' + var] = len(tmp)
                    r, p = pearsonr(
                        tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                    res['ana_r_' + run + '_' + mode + '_' + var] = r
                    # res['ana_p_' + run + '_' + mode + '_' + var] = p
                    # res['ana_rmsd_' + run +'_' + mode + '_' + var] = np.sqrt(((tmp[1]-tmp[2])**2).mean())
                    res['ana_ubrmsd_' + run + '_' + mode + '_' +
                        var] = np.sqrt((((tmp[1] - tmp[1].mean()) -
                                         (tmp[2] - tmp[2].mean()))**2).mean())

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.4f')
        else:
            res.to_csv(result_file,
                       float_format='%0.4f',
                       mode='a',
                       header=False)
Example #8
def run(part):

    parts = 15

    smos = SMOS_io()
    ismn = ISMN_io()
    ascat = HSAF_io(ext=None)
    mswep = MSWEP_io()

    # Median Q from MadKF API/CONUS run.
    Q_avg = 12.
    R_avg = 74.

    # Select only SCAN and USCRN
    ismn.list = ismn.list[(ismn.list.network == 'SCAN') |
                          (ismn.list.network == 'USCRN')]
    ismn.list.index = np.arange(len(ismn.list))

    # Split the station list into `parts` equal chunks for parallelization
    subs = (np.arange(parts + 1) * len(ismn.list) / parts).astype('int')
    subs[-1] = len(ismn.list)
    start = subs[part - 1]
    end = subs[part]
    ismn.list = ismn.list.iloc[start:end, :]

    if platform.system() == 'Windows':
        result_file = os.path.join('D:', 'work', 'MadKF', 'CONUS', 'ismn_eval',
                                   'result_part%i.csv' % part)
    elif platform.system() == 'Linux':
        result_file = os.path.join('/', 'scratch', 'leuven', '320', 'vsc32046',
                                   'output', 'MadKF', 'CONUS', 'ismn_eval',
                                   'result_part%i.csv' % part)
    else:
        result_file = os.path.join('/', 'work', 'MadKF', 'CONUS', 'ismn_eval',
                                   'parts2', 'result_part%i.csv' % part)

    dt = ['2010-01-01', '2015-12-31']

    for cnt, (station,
              insitu) in enumerate(ismn.iter_stations(surf_depth=0.1)):

        # station = ismn.list.loc[978,:]
        # insitu = ismn.read_first_surface_layer('SCAN','Los_Lunas_Pmc')

        print('%i / %i' % (cnt, len(ismn.list)))

        # if True:
        try:
            gpi = lonlat2gpi(station.lon, station.lat, mswep.grid)
            mswep_idx = mswep.grid.index[mswep.grid.dgg_gpi == gpi][0]
            smos_gpi = mswep.grid.loc[mswep_idx, 'smos_gpi']

            precip = mswep.read(mswep_idx)
            sm_ascat = ascat.read(gpi)
            sm_smos = smos.read(smos_gpi) * 100.

            if (precip is None) | (sm_ascat is None) | (sm_smos is None) | (
                    insitu is None):
                continue

            precip = calc_anomaly(precip[dt[0]:dt[1]],
                                  method='moving_average',
                                  longterm=False)
            sm_ascat = calc_anomaly(sm_ascat[dt[0]:dt[1]],
                                    method='moving_average',
                                    longterm=False)
            sm_smos = calc_anomaly(sm_smos[dt[0]:dt[1]],
                                   method='moving_average',
                                   longterm=False)
            insitu = calc_anomaly(insitu[dt[0]:dt[1]].resample('1d').first(),
                                  method='moving_average',
                                  longterm=False).tz_localize(None)

            df = pd.DataFrame({
                1: precip,
                2: sm_ascat,
                3: sm_smos,
                4: insitu
            },
                              index=pd.date_range(dt[0], dt[1]))
            df.loc[np.isnan(df[1]), 1] = 0.
            n = len(df)

            if len(df.dropna()) < 50:
                continue
            gamma = mswep.grid.loc[mswep_idx, 'gamma']
            api = API(gamma=gamma)

            # --- OL run ---
            x_OL = np.full(n, np.nan)
            model = deepcopy(api)
            for t, f in enumerate(precip.values):
                x = model.step(f)
                x_OL[t] = x

            # ----- Calculate uncertainties -----
            # convert (static) forcing to model uncertainty
            P_avg = Q_avg / (1 - gamma**2)

            # calculate TCA based uncertainty and scaling coefficients
            tmp_df = pd.DataFrame({
                1: x_OL,
                2: sm_ascat,
                3: sm_smos
            },
                                  index=pd.date_range(dt[0], dt[1])).dropna()
            snr, r_tc, err, beta = tc(tmp_df)
            P_TC = err[0]**2
            Q_TC = P_TC * (1 - gamma**2)
            R_TC = (err[1] / beta[1])**2
            H_TC = beta[1]

            # Calculate RMSD based uncertainty
            R_rmsd = abs(np.nanmean(
                (tmp_df[1].values - H_TC * tmp_df[2].values)**2) - P_avg)
            # -----------------------------------

            # ----- Run KF using TCA-based uncertainties -----
            api_kf = API(gamma=gamma, Q=Q_TC)
            x_kf, P, R_innov_kf, checkvar_kf, K_kf = \
                KF(api_kf, df[1].values.copy(), df[2].values.copy(), R_TC, H=H_TC)

            # ----- Run EnKF using static uncertainties -----
            forc_pert = ['normal', 'additive', Q_avg]
            obs_pert = ['normal', 'additive', R_avg]
            x_avg, P, R_innov_avg, checkvar_avg, K_avg = \
                EnKF(api, df[1].values.copy(), df[2].values.copy(), forc_pert, obs_pert, H=H_TC, n_ens=50)

            # ----- Run EnKF using RMSD-based uncertainties (corrected for model uncertainty) -----
            # forc_pert = ['normal', 'additive', Q_avg]
            # obs_pert = ['normal', 'additive', R_rmsd]
            # x_rmsd, P, R_innov_rmsd, checkvar_rmsd, K_rmsd = \
            #     EnKF(api, df[1].values.copy(), df[2].values.copy(), forc_pert, obs_pert, H=H_TC, n_ens=50)

            # ----- Run MadKF -----
            n_tries = 0  # separate counter; `cnt` tracks the station loop
            checkvar_madkf = 9999.
            while ((checkvar_madkf < 0.95) |
                   (checkvar_madkf > 1.05)) & (n_tries < 5):
                n_tries += 1
                tmp_x_madkf, P_madkf, R_madkf, Q_madkf, H_madkf, R_innov_madkf, tmp_checkvar_madkf, K_madkf = \
                    MadKF(api, df[1].values.copy(), df[2].values.copy(), n_ens=100, n_iter=20)
                if abs(1 - tmp_checkvar_madkf) < abs(1 - checkvar_madkf):
                    checkvar_madkf = tmp_checkvar_madkf
                    x_madkf = tmp_x_madkf

            df['x_ol'] = x_OL
            df['x_kf'] = x_kf
            df['x_avg'] = x_avg
            # df['x_rmsd'] = x_rmsd
            df['x_madkf'] = x_madkf

            # tc_ol = tc(df[[4,3,'x_ol']])
            # tc_kf = tc(df[[4,3,'x_kf']])
            # tc_avg = tc(df[[4,3,'x_avg']])
            # tc_rmsd = tc(df[[4,3,'x_rmsd']])
            # tc_madkf = tc(df[[4,3,'x_madkf']])

            ci_l_ol, ci_m_ol, ci_u_ol = bootstrap_tc(df[[4, 3, 'x_ol']])
            ci_l_kf, ci_m_kf, ci_u_kf = bootstrap_tc(df[[4, 3, 'x_kf']])
            ci_l_avg, ci_m_avg, ci_u_avg = bootstrap_tc(df[[4, 3, 'x_avg']])
            # ci_l_rmsd, ci_m_rmsd, ci_u_rmsd = bootstrap_tc(df[[4,3,'x_rmsd']])
            ci_l_madkf, ci_m_madkf, ci_u_madkf = bootstrap_tc(
                df[[4, 3, 'x_madkf']])

            corr = df.dropna().corr()
            n_all = len(df.dropna())

            result = pd.DataFrame(
                {
                    'lon': station.lon,
                    'lat': station.lat,
                    'network': station.network,
                    'station': station.station,
                    'gpi': gpi,
                    'n_all': n_all,
                    'Q_est_madkf': Q_madkf,
                    'R_est_madkf': R_madkf,
                    'corr_ol': corr[4]['x_ol'],
                    'corr_kf': corr[4]['x_kf'],
                    'corr_avg': corr[4]['x_avg'],
                    # 'corr_rmsd': corr[4]['x_rmsd'],
                    'corr_madkf': corr[4]['x_madkf'],
                    # 'snr_ol': tc_ol[0][2],
                    # 'snr_kf': tc_kf[0][2],
                    # 'snr_avg': tc_avg[0][2],
                    # 'snr_rmsd': tc_rmsd[0][2],
                    # 'snr_madkf': tc_madkf[0][2],
                    # 'r_ol': tc_ol[1][2],
                    # 'r_kf': tc_kf[1][2],
                    # 'r_avg': tc_avg[1][2],
                    # 'r_rmsd': tc_rmsd[1][2],
                    # 'r_madkf': tc_madkf[1][2],
                    # 'rmse_kf': tc_kf[2][2],
                    # 'rmse_avg': tc_avg[2][2],
                    # 'rmse_rmsd': tc_rmsd[2][2],
                    # 'rmse_madkf': tc_madkf[2][2],
                    # 'rmse_ol': tc_ol[2][2],
                    'r_ol_l': ci_l_ol,
                    'r_ol_m': ci_m_ol,
                    'r_ol_u': ci_u_ol,
                    'r_kf_l': ci_l_kf,
                    'r_kf_m': ci_m_kf,
                    'r_kf_u': ci_u_kf,
                    'r_avg_l': ci_l_avg,
                    'r_avg_m': ci_m_avg,
                    'r_avg_u': ci_u_avg,
                    # 'r_rmsd_l': ci_l_rmsd,
                    # 'r_rmsd_m': ci_m_rmsd,
                    # 'r_rmsd_u': ci_u_rmsd,
                    'r_madkf_l': ci_l_madkf,
                    'r_madkf_m': ci_m_madkf,
                    'r_madkf_u': ci_u_madkf,
                    'checkvar_kf': checkvar_kf,
                    'checkvar_avg': checkvar_avg,
                    # 'checkvar_rmsd': checkvar_rmsd,
                    'checkvar_madkf': checkvar_madkf,
                    'R_innov_kf': R_innov_kf,
                    'R_innov_avg': R_innov_avg,
                    # 'R_innov_rmsd': R_innov_rmsd,
                    'R_innov_madkf': R_innov_madkf
                },
                index=(station.name, ))

            if not os.path.isfile(result_file):
                result.to_csv(result_file, float_format='%0.4f')
            else:
                result.to_csv(result_file,
                              float_format='%0.4f',
                              mode='a',
                              header=False)
        except Exception:
            print('GPI failed.')
            continue

    ascat.close()
    mswep.close()
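
The API model stepped in the open-loop run is an antecedent precipitation index, i.e. an AR(1) recursion driven by precipitation; for white model noise with variance Q, its steady-state variance is Q / (1 - gamma**2), which is exactly how P_avg is derived from Q_avg above. A minimal sketch (the project's API class likely carries more state, e.g. Q):

class SimpleAPI:
    # Antecedent precipitation index: x_t = gamma * x_{t-1} + f_t
    def __init__(self, gamma):
        self.gamma = gamma
        self.x = 0.0

    def step(self, forcing):
        self.x = self.gamma * self.x + forcing
        return self.x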
Example #9
def plot_suspicious_stations(root):

    statlist = pd.read_csv('/Users/u0116961/Documents/work/MadKF/CLSM/suspicious_stations/station_list_r_diff.csv', index_col=0)

    rmsd_root = 'US_M36_SMAP_TB_DA_SM_PROXY_'
    rmsd_exps = list(np.sort([x.name.split(rmsd_root)[1] for x in Path('/Users/u0116961/data_sets/LDASsa_runs').glob('*SM_PROXY*')]))

    ds_ol = LDAS_io('xhourly', 'US_M36_SMAP_TB_OL_scaled_4K_obserr').timeseries
    ds_da = LDAS_io('xhourly', 'US_M36_SMAP_TB_DA_scaled_4K_obserr').timeseries

    ts_ana = LDAS_io('ObsFcstAna', 'US_M36_SMAP_TB_DA_scaled_4K_obserr').timeseries['obs_obs']
    t_ana = pd.DatetimeIndex(ts_ana.time.values).sort_values()

    ascat = HSAF_io()
    gpi_list = pd.read_csv(ascat.root / 'warp5_grid' / 'pointlist_warp_conus.csv', index_col=0)

    ismn = ISMN_io()

    variables = ['sm_surface', 'sm_rootzone']
    modes = ['absolute', 'longterm', 'shortterm']

    ismn.list.index = ismn.list.network + '_' + ismn.list.station
    ismn.list = ismn.list.reindex(statlist.index)

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations(surface_only=False)):
        if 'tmp_res' in locals():
            if (meta.network in tmp_res) & (meta.station in tmp_res):
                print(f'Skipping {i}')
                continue

        try:
            res = pd.DataFrame(meta.copy()).transpose()
            col = meta.ease_col
            row = meta.ease_row

            gpi = lonlat2gpi(meta.lon, meta.lat, gpi_list)

            ts_ascat = ascat.read(gpi) / 100 * 0.6
            if ts_ascat is None:
                continue

            for mode in modes:
                for var in variables:

                    tmp = statlist[(statlist.network==meta.network)&(statlist.station==meta.station)]
                    dpr = tmp[f'diff_pearsonr2_{mode}_{var}'].values[0]
                    dtr = tmp[f'diff_tcar2_{mode}_{var}'].values[0]

                    if not ((dtr < 0) & (dpr > 0)):
                        continue

                    if mode == 'absolute':
                        ts_asc = ts_ascat.dropna()
                    else:
                        ts_asc = calc_anom(ts_ascat, longterm=(mode == 'longterm')).dropna()
                    ts_asc.name = 'ascat'
                    ts_asc = pd.DataFrame(ts_asc)

                    if mode == 'absolute':
                        ts_ins = ts_insitu[var].dropna()
                    else:
                        ts_ins = calc_anom(ts_insitu[var], longterm=(mode == 'longterm')).dropna()
                    ts_ins.name = 'insitu'
                    ts_ins = pd.DataFrame(ts_ins)

                    ind = (ds_ol['snow_mass'].isel(lat=row, lon=col).values == 0) & \
                          (ds_ol['soil_temp_layer1'].isel(lat=row, lon=col).values > 277.15)
                    ts_ol = ds_ol[var].isel(lat=row, lon=col).to_series().loc[ind]
                    ts_ol.index += pd.to_timedelta('2 hours')
                    ind_obs = np.bitwise_or.reduce(~np.isnan(ts_ana[:, :, row, col].values), 1)
                    if mode == 'absolute':
                        ts_ol = ts_ol.reindex(t_ana[ind_obs]).dropna()
                    else:
                        ts_ol = calc_anom(ts_ol.reindex(t_ana[ind_obs]), longterm=(mode == 'longterm')).dropna()
                    ts_ol.name = 'open_loop'
                    ts_ol = pd.DataFrame(ts_ol)

                    ind = (ds_da['snow_mass'].isel(lat=row, lon=col).values == 0) & \
                          (ds_da['soil_temp_layer1'].isel(lat=row, lon=col).values > 277.15)
                    ts_da = ds_da[var].isel(lat=row, lon=col).to_series().loc[ind]
                    ts_da.index += pd.to_timedelta('2 hours')
                    ind_obs = np.bitwise_or.reduce(~np.isnan(ts_ana[:, :, row, col].values), 1)
                    if mode == 'absolute':
                        ts_da = ts_da.reindex(t_ana[ind_obs]).dropna()
                    else:
                        ts_da = calc_anom(ts_da.reindex(t_ana[ind_obs]), longterm=(mode == 'longterm')).dropna()
                    ts_da.name = 'DA_4K'
                    ts_da = pd.DataFrame(ts_da)

                    matched = df_match(ts_ol, ts_da, ts_asc, ts_ins, window=0.5)
                    data = ts_ol.join(matched[0]['DA_4K']).join(matched[1]['ascat']).join(matched[2]['insitu']).dropna()

                    dpr_triplets = data.corr()['DA_4K']['insitu'] - data.corr()['open_loop']['insitu']
                    if dpr_triplets < 0:
                        continue

                    f = plt.figure(figsize=(15, 5))
                    sns.lineplot(data=data[['open_loop', 'DA_4K', 'insitu']], dashes=False, linewidth=1.5, axes=plt.gca())
                    plt.title(f'{meta.network} / {meta.station} ({var}): d(Pearson R2) = {dpr_triplets:.3f} , d(TCA R2) = {dtr:.3f}')

                    fbase = Path('/Users/u0116961/Documents/work/MadKF/CLSM/suspicious_stations/timeseries')
                    fname = fbase / f'{mode}_{var}_{meta.network}_{meta.station}.png'
                    f.savefig(fname, dpi=300, bbox_inches='tight')
                    plt.close()

        except Exception:
            continue
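
df_match collocates each reference time stamp with the nearest sample of the other series within a +/- 0.5 day window. A rough pandas-only equivalent using reindex with a tolerance (the exact df_match semantics, e.g. duplicate handling and the returned distance column, differ):

import pandas as pd

def match_nearest(ref_index, other, window=0.5):
    # Nearest-neighbour temporal matching within +/- `window` days;
    # assumes `other` has a unique DatetimeIndex.
    other = other.sort_index()
    return other.reindex(ref_index, method='nearest',
                         tolerance=pd.Timedelta(days=window))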
Example #10
def run(part):

    parts = 6

    result_file = r'D:\work\ESA_CCI_SM\ismn_r2\ismn_r2_part%i.csv' % part

    cci = CCISM_io()
    ismn = ISMN_io()

    # ismn.list = ismn.list.iloc[100:120]

    # Split the station list into `parts` equal chunks for parallelization
    subs = (np.arange(parts + 1) * len(ismn.list) / parts).astype('int')
    subs[-1] = len(ismn.list)
    start = subs[part - 1]
    end = subs[part]
    ismn.list = ismn.list.iloc[start:end, :]

    freq = ['abs', 'anom']

    res = ismn.list.copy()
    res.drop(['ease_col', 'ease_row'], axis='columns', inplace=True)
    res['r_abs'] = np.nan
    res['r_anom'] = np.nan

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):
        print('%i/%i (Proc %i)' % (i, len(ismn.list), part))

        if ts_insitu is None:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue
        ts_insitu = ts_insitu['2007-10-01':'2014-12-31']
        if len(ts_insitu) < 10:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue
        df_insitu = pd.DataFrame(ts_insitu).dropna()
        df_insitu_anom = pd.DataFrame(calc_anomaly(ts_insitu)).dropna()

        df_cci = cci.read(meta.lon,
                          meta.lat,
                          version='v04.4',
                          mode=['ACTIVE', 'PASSIVE']).dropna()
        if len(df_cci) < 10:
            print('No CCI data for ' + meta.network + ' / ' + meta.station)
            continue

        for f in freq:
            if f == 'abs':
                matched = df_match(df_cci, df_insitu, window=0.5)
            else:
                df_cci.loc[:, 'ACTIVE_v04.4'] = calc_anomaly(
                    df_cci['ACTIVE_v04.4'])
                df_cci.loc[:, 'PASSIVE_v04.4'] = calc_anomaly(
                    df_cci['PASSIVE_v04.4'])
                df_cci.dropna(inplace=True)
                if (len(df_cci) < 10) | (len(df_insitu_anom) < 10):
                    print('No in situ or CCI anomaly data for ' +
                          meta.network + ' / ' + meta.station)
                    continue
                matched = df_match(df_cci, df_insitu_anom, window=0.5)

            data = df_cci.join(matched['insitu']).dropna()

            if len(data) < 100:
                continue

            vals = data[['insitu', 'ACTIVE_v04.4']].values
            c1, p1 = pearsonr(vals[:, 0], vals[:, 1])
            vals = data[['insitu', 'PASSIVE_v04.4']].values
            c2, p2 = pearsonr(vals[:, 0], vals[:, 1])
            vals = data[['ACTIVE_v04.4', 'PASSIVE_v04.4']].values
            c3, p3 = pearsonr(vals[:, 0], vals[:, 1])

            if (c1 < 0) | (c2 < 0) | (c3 < 0) | (p1 > 0.05) | (p2 > 0.05) | (
                    p3 > 0.05):
                continue

            res.loc[meta.name, 'r_' + f] = np.sqrt(tc(data)[1][2])

    res.to_csv(result_file, float_format='%0.4f')