Example #1
        # snippet from a larger module; pandas import added for
        # self-containment (core_pp is the repo's own preprocessing module)
        import pandas as pd

        def handle_fit_model_dates(dates_RV, dates_all, RV_ts,
                                   fit_model_dates):
            if fit_model_dates is None:
                # RV_ts and RV_ts_fit are equal if fit_model_dates is None
                bool_mask = [d in dates_RV for d in dates_all]
                fit_model_mask = pd.DataFrame(bool_mask,
                                              columns=['fit_model_mask'],
                                              index=dates_all)
                RV_ts_fit = RV_ts
                fit_dates = dates_RV
            else:
                startperiod, endperiod = fit_model_dates
                startyr = dates_all[0].year
                endyr = dates_all[-1].year
                start_end_date = (startperiod, endperiod)
                start_end_year = (startyr, endyr)
                fit_dates = core_pp.get_subdates(dates_all,
                                                 start_end_date=start_end_date,
                                                 start_end_year=start_end_year)
                bool_mask = [d in fit_dates for d in dates_all]
                fit_model_mask = pd.DataFrame(bool_mask,
                                              columns=['fit_model_mask'],
                                              index=dates_all)

                # note: `fullts` (the full target series over dates_all)
                # comes from the enclosing scope in the source module
                RV_ts_fit = fullts[fit_model_mask.values]
            return fit_model_mask, fit_dates, RV_ts_fit
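
A minimal usage sketch of the fit_model_dates=None branch, with synthetic
daily dates (names below are illustrative, not from the source):

import pandas as pd

dates_all = pd.date_range('2000-01-01', '2000-12-31', freq='D')
dates_RV = dates_all[(dates_all.month >= 6) & (dates_all.month <= 8)]
RV_ts = pd.DataFrame(range(dates_RV.size), index=dates_RV, columns=['target'])
# returns the fit mask over all dates, the JJA fit dates, and the fit series
mask, fit_dates, RV_ts_fit = handle_fit_model_dates(dates_RV, dates_all,
                                                    RV_ts, None)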
Example #2
# snippet from a larger module; numpy/pandas imports added for
# self-containment (core_pp and functions_pp are the repo's own modules)
import numpy as np
import pandas as pd


def start_end_date_mean(df_data, start_end_date):

    # create mask for the dates to aggregate over each year
    is_splits = hasattr(df_data.index, 'levels')  # MultiIndex: (split, date)
    pd_dates = df_data.loc[0].index if is_splits else df_data.index
    subset_dates = core_pp.get_subdates(pd_dates, start_end_date)
    dates_to_aggr_mask = pd.Series(np.repeat(False, pd_dates.size),
                                   index=pd_dates)
    dates_to_aggr_mask.loc[subset_dates] = True
    if is_splits:
        years = df_data.loc[0][dates_to_aggr_mask].index.year
    else:
        years = df_data[dates_to_aggr_mask].index.year
    # one timestamp per year: the mean date of that year's aggregation window
    index = [
        functions_pp.get_oneyr(subset_dates, yr).mean()
        for yr in np.unique(years)
    ]

    if is_splits:
        splits = df_data.index.levels[0]
        df_data_s = np.zeros((splits.size), dtype=object)
        for s in splits:
            df_s = df_data.loc[s]
            df_s = df_s[dates_to_aggr_mask].groupby(years).mean()
            df_s.index = pd.to_datetime(index)
            df_data_s[s] = df_s
        df_data_resample = pd.concat(list(df_data_s), keys=range(splits.size))
    else:
        df_data_resample = df_data[dates_to_aggr_mask].groupby(years).mean()
        df_data_resample.index = pd.to_datetime(index)
    return df_data_resample
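
A usage sketch on a plain (single-index) daily DataFrame, assuming the repo's
core_pp and functions_pp modules are importable:

import numpy as np
import pandas as pd

dates = pd.date_range('2000-01-01', '2002-12-31', freq='D')
df = pd.DataFrame(np.random.randn(dates.size, 2), index=dates,
                  columns=['ts1', 'ts2'])
# one JJA mean per year, timestamped at the centre of each yearly window
df_jja = start_end_date_mean(df, start_end_date=('06-01', '08-31'))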
Example #3
    def _redefine_RV_mask(self, start_end_TVdate):
        self.df_data = self.df_data.copy()
        self.start_end_TVdate_orig = fcev._get_start_end_TVdate(self)
        self.start_end_TVdate = start_end_TVdate
        RV_mask_orig = self.df_data['RV_mask'].copy()
        dates_RV = core_pp.get_subdates(self.dates_df,
                                        start_end_TVdate,
                                        start_end_year=None)
        # rebuild the mask on split 0, then tile it across all train-test
        # splits
        new_RVmask = RV_mask_orig.loc[0].copy()
        new_RVmask.loc[:] = False
        new_RVmask.loc[dates_RV] = True
        self.df_data['RV_mask'] = pd.concat([new_RVmask] * self.splits.size,
                                            keys=self.splits)
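
The rebuild-then-tile pattern above, as a self-contained pandas sketch (names
are illustrative):

import pandas as pd

dates = pd.date_range('2000-01-01', '2001-12-31', freq='D')
mask = pd.Series(False, index=dates, name='RV_mask')
mask.loc[dates[dates.month == 7]] = True        # July as the target period
splits = range(3)
mask_s = pd.concat([mask] * len(splits), keys=splits)  # (split, date) index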
Example #4
    def select_period(self,
                      df,
                      targ_var_mask,
                      start_date,
                      end_date,
                      start_end_year,
                      leap_year,
                      rename=False):

        # dates of the target variable in the original (split-0) index;
        # targ_var_mask is unused in this snippet
        dates_target_var_origin = df.loc[0].index[df.loc[0]['RV_mask']]
        df_resample = self.resample(df=df)
        dates_period = get_subdates(dates_target_var_origin, start_date,
                                    end_date, start_end_year, leap_year)
        # subset the resampled data to the requested period on every split
        # (the source returned the dates themselves, which breaks the
        # rename() call below)
        df_period = df_resample.loc[pd.IndexSlice[:, dates_period], :]
        if rename:
            df_period = df_period.rename(rename, axis=1)
        return df_period
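
The df.loc[0] idiom above extracts one train-test split from a (split, date)
MultiIndex; a self-contained sketch of the RV-date lookup it relies on:

import pandas as pd

dates = pd.date_range('2000-01-01', '2000-12-31', freq='D')
df = pd.concat([pd.DataFrame({'RV_mask': dates.month == 7}, index=dates)] * 2,
               keys=range(2))
dates_RV = df.loc[0].index[df.loc[0]['RV_mask']]  # target dates of split 0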
Example #5
# snippet from a larger script; imports added for self-containment
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# the head of this plotting-kwargs dict was truncated in the source; the name
# kwrgs_plot is implied by its use below
kwrgs_plot = {'hspace': .2, 'cbar_vert': .05,
              'clevels': np.arange(-.5, .51, .1)}
plot_maps.plot_corr_maps(xr_snap, row_dim='lag', col_dim='split',
                         **kwrgs_plot)
plt.savefig(os.path.join(rg.path_outsub1, f'snapshots_{var}_rm{rm}.pdf'))
#%% Correlation PNA-like RW with Wavenumber 6 phase 2 # only for eastern
import core_pp
import find_precursors
import functions_pp  # used at the end of the loop below
values = []
if west_or_east == 'eastern':
    lags_list = range(-10,10)
    for lag in lags_list:
        selbox = (0, 360, 25, 60)
        tfreq = 1
        dates_RV = core_pp.get_subdates(pd.to_datetime(rg.fulltso.time.values),
                                        start_end_date=rg.start_end_TVdate)
        RV_ts = rg.fulltso.sel(time=dates_RV)
        ds_v300 = core_pp.import_ds_lazy(rg.list_precur_pp[1][1])
        dslocal = core_pp.get_selbox(ds_v300, selbox=selbox)

        datesRW = core_pp.get_subdates(pd.to_datetime(dslocal.time.values),
                                       start_end_date=rg.start_end_TVdate)
        datesRW = datesRW + pd.Timedelta(f'{lag}d')
        dslocal = dslocal.sel(time=datesRW)

        # `xarray` here is a DataArray variable defined earlier in the
        # (elided) script, shadowing the xarray library name
        wv6local = core_pp.get_selbox(xarray.sel(lag=5), selbox=selbox)
        patternlocal = wv6local.mean(dim='lag')
        ts = find_precursors.calc_spatcov(dslocal, patternlocal)
        ts_15, d = functions_pp.time_mean_bins(
            ts, tfreq, start_end_date=rg.start_end_TVdate)
        # [remainder of the loop body truncated in the source]
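
The lag trick above simply shifts the whole selected date window; a minimal
standalone sketch:

import pandas as pd

dates = pd.date_range('2000-06-01', '2000-08-31', freq='D')
lag = -5
dates_lagged = dates + pd.Timedelta(f'{lag}d')  # window moved back five days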
Example #6
rg.pp_precursors()

#%%

print(rg.list_precur_pp)  # inspect the preprocessed precursor file list

var_filename = rg.list_precur_pp[0][1]
region = 'USCAnew'

#%%

# imports added for self-containment (os and numpy are used further below)
import os

import numpy as np
import pandas as pd

import core_pp

ds = core_pp.import_ds_lazy(var_filename)
summer_dates = core_pp.get_subdates(pd.to_datetime(ds.time.values),
                                    start_end_date=('06-01', '08-31'))
# plot the JJA climatology of the precursor field
ds.sel(time=summer_dates).mean(dim='time').plot()

#%%

if region == 'USCAnew':
    selbox = (230, 300, 25, 70)
    TVpath = os.path.join(path_data, 'tfreq15_nc7_dendo_57db0USCA.nc')
    # np_array_xy = np.array([[-97, 39], [-89, 39], [-82, 40],
    #                        [-116,36], [-122,41], [-117,46]])
    np_array_xy = np.array([[-96, 36], [-92, 41], [-84, 35], [-84, 41],
                            [-114, 36], [-120, 36], [-122, 44], [-118, 48]])
    t, c = 15, 7
# elif region == 'USCA':
#     selbox = (230, 300, 25, 70)
#     TVpath = os.path.join(path_outmain, 'tf10_nc5_dendo_5dbee_USCA.nc')
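
The seasonal-mean map above can also be built with plain xarray month masking
instead of core_pp.get_subdates; a self-contained sketch on random data:

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', '2001-12-31', freq='D')
da = xr.DataArray(np.random.randn(time.size, 4, 8),
                  coords={'time': time},
                  dims=('time', 'latitude', 'longitude'))
jja = da.sel(time=da['time.month'].isin([6, 7, 8]))
jja_clim = jja.mean(dim='time')  # same JJA climatology the snippet plots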
Example #7
# snippet from a larger module; typing/numpy/pandas imports added for
# self-containment (core_pp and functions_pp are the repo's own modules)
from typing import List, Tuple

import numpy as np
import pandas as pd


def import_precur_ts(list_import_ts: List[tuple],
                     df_splits: pd.DataFrame,
                     start_end_date: Tuple[str, str],
                     start_end_year: Tuple[int, int],
                     start_end_TVdate: Tuple[str, str],
                     cols: list = None,
                     precur_aggr: int = 1):
    '''
    list_import_ts has the format List[tuple]: [(name, path_data)].
    '''
    splits = df_splits.index.levels[0]
    orig_traintest = functions_pp.get_testyrs(df_splits)
    df_data_ext_s = np.zeros(splits.size, dtype=object)
    for i, (name, path_data) in enumerate(list_import_ts):

        df_data_e_all = functions_pp.load_hdf5(path_data)['df_data']
        if isinstance(df_data_e_all, pd.Series):
            df_data_e_all = pd.DataFrame(df_data_e_all)

        df_data_e_all = df_data_e_all.iloc[:, :]  # force a copy of the frame
        if cols is None:
            cols = list(
                df_data_e_all.columns[(df_data_e_all.dtypes != bool).values])
        elif type(cols) is str:
            cols = [cols]

        if hasattr(df_data_e_all.index, 'levels'):
            dates_subset = core_pp.get_subdates(df_data_e_all.loc[0].index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[pd.IndexSlice[:,
                                                            dates_subset], :]
        else:
            dates_subset = core_pp.get_subdates(df_data_e_all.index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[dates_subset]

        if 'TrainIsTrue' in df_data_e_all.columns:
            _c = [
                k for k in df_splits.columns
                if k in ['TrainIsTrue', 'RV_mask']
            ]
            # check if traintest split is correct
            ext_traintest = functions_pp.get_testyrs(df_data_e_all[_c])
            _check_traintest = all(
                np.equal(core_pp.flatten(ext_traintest),
                         core_pp.flatten(orig_traintest)))
            assert _check_traintest, (
                'Train test years of df_splits are not the '
                'same as imported timeseries')

        for s in range(splits.size):
            if 'TrainIsTrue' in df_data_e_all.columns:
                df_data_e = df_data_e_all.loc[s]
            else:
                df_data_e = df_data_e_all

            df_data_ext_s[s] = df_data_e[cols]
            tfreq_date_e = (df_data_e.index[1] - df_data_e.index[0]).days

            if precur_aggr != tfreq_date_e:
                try:
                    df_data_ext_s[s] = functions_pp.time_mean_bins(
                        df_data_ext_s[s],
                        precur_aggr,
                        start_end_date,
                        start_end_year,
                        start_end_TVdate=start_end_TVdate)[0]
                except KeyError as e:
                    print('KeyError captured, likely the requested dates '
                          'given by start_end_date and start_end_year are '
                          'not found in the external pandas timeseries.\n'
                          '{}'.format(str(e)))
        print(f'loaded external timeseries: {cols}')

        if i == 0:
            df_data_ext = pd.concat(list(df_data_ext_s),
                                    keys=range(splits.size))
        else:
            df_add = pd.concat(list(df_data_ext_s), keys=range(splits.size))
            df_data_ext = df_data_ext.merge(df_add,
                                            left_index=True,
                                            right_index=True)
        cols = None  # reset so the next dataset infers its own columns
    return df_data_ext
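
A hypothetical call, assuming df_splits comes from an RGCPD train-test split
and the HDF5 path (illustrative, not from the source) holds a 'df_data' frame:

list_import_ts = [('sst_pattern', './precursor_timeseries.h5')]
df_ext = import_precur_ts(list_import_ts, df_splits,
                          start_end_date=('01-01', '12-31'),
                          start_end_year=(1979, 2018),
                          start_end_TVdate=('06-01', '08-31'),
                          precur_aggr=15)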
Example #8
    out = rgPDO.fit_df_data_ridge(target=target,
                                  df_data=df_prec,
                                  tau_min=0, tau_max=0,
                                  kwrgs_model={'alphas': np.array([.01, .1, 1, 5, 10])})
    predict = out[0].rename({0:'AR1'}, axis=1)

    lowPDO, df_lagmask = get_lagged_ts(rgPDO.df_data.copy(), 1, ['PDO0.5rm'])
    perPDO = lowPDO.rename({'PDO1.0rm_2':'persistence'}, axis=1)
    perPDO = perPDO.loc[df_prec.index]
    predict = predict.merge(perPDO, left_index=True, right_index=True)

    dates = core_pp.get_subdates(rgPDO.dates_TV, start_end_year=(1980,2020))
    predict = predict.loc[pd.IndexSlice[:, dates], :]
    test = fc_utils.get_scores(predict,
                               score_func_list=[fc_utils.corrcoef,
                                                fc_utils.metrics.mean_squared_error])[2]
    df_test = functions_pp.get_df_test(predict,
                                       df_splits=rgPDO.df_data.loc[predict.index][['TrainIsTrue', 'RV_mask']])
    # df_z = df_test[['AR1']]  # alternative conditioning series
    df_z = lowPDO
    # df_z = functions_pp.get_df_test(df_prec,
    #                                 df_splits=rgPDO.df_data.loc[predict.index][['TrainIsTrue', 'RV_mask']])
    # years = functions_pp.get_oneyr(df_z, *list(range(1980, 2020+1)))
    # df_z = df_z.loc[years]

    kwrgs_func = {'filepath':df_z,
                  'lag_z':0}
    # [snippet jumps here in the source; the head of a per-year plotting call
    #  was cut, leaving only its trailing arguments:
    #                           legend=True,
    #                           label=yr,
    #                           alpha=alpha) ]
    # ax.set_ylim(-.1,.1)
    ax.hlines(y=0.5, xmin=0.05, xmax=.95, transform=ax.transAxes)
    ax.set_xticks(range(df_yr.index.values.size))
    xticklabels = [
        '{} {}'.format(*list(item)) for item in df_yr.index.tolist()
    ]
    ax.set_xticklabels(xticklabels, rotation=-45)
    allyrs.append(list(df_yr['SM'].values))

#%%

summerdays = core_pp.get_subdates(df_T.groupby(level=1).mean().index,
                                  start_end_date=('08-01', '08-31'),
                                  start_end_year=(1980, 2018))
df_sum = df_T.groupby(level=1).mean().loc[summerdays]
summerSM = df_sum['SM'].groupby(df_sum.index.year).mean()
winterdays = core_pp.get_subdates(df_SST[['SST pattern']].groupby(level=1).mean().index,
                                  start_end_date=('01-01', '08-31'),
                                  start_end_year=(1979, 2017))
winterdays = functions_pp.func_dates_min_lag(winterdays, lag=92)[1]
df_win = df_SST[['SST pattern']].groupby(level=1).mean().loc[winterdays]
winterSST = df_win.groupby(df_win.index.year).mean().iloc[:-1]
falldays = core_pp.get_subdates(df_SST[['SST pattern']].groupby(level=1).mean().index,
                                start_end_date=('09-01', '12-31'),
                                start_end_year=(1980, 2018))
df_fall = df_SST[['SST pattern']].groupby(level=1).mean().loc[falldays]
# aggregate fall SST per year (the source used df_win here: a copy-paste bug)
fallSST = df_fall.groupby(df_fall.index.year).mean().iloc[1:]
# [snippet jumps in the source; the head of this scores print was cut]
print('BSS {:.2f}\n'.format(df_train_m.mean(0).loc[0]['BSS']),
      'AUC {:.2f}'.format(df_train_m.mean(0).loc[0]['roc_auc_score']))

#%% Correlating both the gradient and absolute timeseries of ENSO with target
df_ENSO_s = df_ENSO.loc[0]
grad_w = 3
gap = 6
for month in range(1,13):
    # gradient: past rolling mean (shifted back by ~half a window plus the
    # gap) minus the present rolling mean
    grad_ENSO = (df_ENSO_s.shift(int(1 + grad_w/2 + gap))
                 .rolling(int(grad_w/2), center=True, min_periods=1).mean()
                 - df_ENSO_s.rolling(int(grad_w/2), center=True,
                                     min_periods=1).mean())
    X_dates = core_pp.get_subdates(df_ENSO_s.index,
                                   start_end_date=(f'{month}-01',f'{month}-28'),
                                   start_end_year=(1951, 2019))
    target_data = rg.TV_ts[1:].values
    # df_ENSO_s.loc[X_dates].plot()
    corr_grad = np.corrcoef(grad_ENSO.loc[X_dates].values.squeeze(),
                            target_data)[0][1]
    corr_abs = np.corrcoef(df_ENSO_s.loc[X_dates].values.squeeze(),
                           target_data)[0][1]
    print('{:02d}'.format(month),
          'Gradient ENSO {:.2f}\n'.format(corr_grad),
          '  Absolute values {:.2f}'.format(corr_abs) )
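
The gradient above is a difference of two rolling means, one shifted into the
past; a standalone pandas sketch with the same window and gap values:

import numpy as np
import pandas as pd

s = pd.Series(np.random.randn(730),
              index=pd.date_range('2000-01-01', periods=730, freq='D'))
grad_w, gap = 3, 6
grad = (s.shift(int(1 + grad_w/2 + gap))
        .rolling(int(grad_w/2), center=True, min_periods=1).mean()
        - s.rolling(int(grad_w/2), center=True, min_periods=1).mean())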


Example #11
# snippet from a larger module; numpy/pandas/xarray imports added for
# self-containment (core_pp, functions_pp and plot_maps are the repo's own
# modules)
import numpy as np
import pandas as pd
import xarray as xr


def ENSO_34(filepath, df_splits=None, get_ENSO_states: bool = True):
    '''
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf
    selbox has the format (lon_min, lon_max, lat_min, lat_max).
    '''

    kwrgs_pp = {
        'selbox': (190, 240, -5, 5),
        'format_lon': 'only_east',
        'seldates': None
    }

    ds = core_pp.import_ds_lazy(filepath, **kwrgs_pp)
    dates = pd.to_datetime(ds.time.values)
    data = functions_pp.area_weighted(ds).mean(dim=('latitude', 'longitude'))
    df_ENSO = pd.DataFrame(data=data.values, index=dates, columns=['ENSO34'])
    if df_splits is not None:
        splits = df_splits.index.levels[0]
        df_ENSO = pd.concat([df_ENSO] * splits.size, axis=0, keys=splits)

    if get_ENSO_states:
        # Classification inspired by Anderson 2017 - Life cycles of
        # agriculturally relevant ENSO teleconnections in North and South
        # America (http://doi.wiley.com/10.1002/joc.4916): years whose mean
        # boreal wintertime (Oct-Dec) Nino-3.4 SST anomaly exceeds one
        # standard deviation (as implemented below) are labelled Nino/Nina.
        if hasattr(df_ENSO.index, 'levels'):
            df_ENSO_s = df_ENSO.loc[0]
        else:
            df_ENSO_s = df_ENSO
        dates = df_ENSO_s.index
        df_3monthmean = df_ENSO_s.rolling(3, center=True, min_periods=1).mean()
        std_ENSO = df_3monthmean.std()
        OND, groups = core_pp.get_subdates(dates,
                                           start_end_date=('10-01', '12-31'),
                                           returngroups=True)
        OND_ENSO = df_3monthmean.loc[OND].groupby(groups).mean()
        nino_yrs = OND_ENSO[OND_ENSO > df_3monthmean.mean() +
                            std_ENSO].dropna().index
        nina_yrs = OND_ENSO[OND_ENSO < df_3monthmean.mean() -
                            std_ENSO].dropna().index
        neutral = [
            y for y in OND_ENSO.index
            if y not in core_pp.flatten([nina_yrs, nino_yrs])
        ]
        states = {}
        for d in dates:
            if d.year in nina_yrs:
                states[d.year] = -1
            elif d.year in neutral:
                states[d.year] = 0
            elif d.year in nino_yrs:
                states[d.year] = 1

        cycle_list = []
        for s, v in [('EN', 1), ('LN', -1)]:
            ENSO_cycle = {d.year: 0 for d in dates}
            for year in np.unique(dates.year):
                if states[year] == v:
                    # label the event year and its neighbours, e.g. EN-1,
                    # EN0, EN+1 around an El Nino year
                    ENSO_cycle[year] = f'{s}0'
                    if year - 1 in dates.year and states[year - 1] != v:
                        ENSO_cycle[year - 1] = f'{s}-1'
                    if year + 1 in dates.year and states[year + 1] != v:
                        ENSO_cycle[year + 1] = f'{s}+1'
            cycle_list.append(ENSO_cycle)

        time_index = pd.to_datetime([f'{y}-01-01' for y in states.keys()])
        df_state = pd.concat([
            pd.Series(states),
            pd.Series(cycle_list[0]),
            pd.Series(cycle_list[1])
        ],
                             axis=1,
                             keys=['state', 'EN_cycle', 'LN_cycle'])
        df_state.index = time_index

        if hasattr(df_ENSO.index, 'levels'):  # copy to other traintest splits
            df_state = pd.concat([df_state] * splits.size, keys=splits)

        composites = np.zeros(3, dtype=object)
        for i, yrs in enumerate([nina_yrs, neutral, nino_yrs]):
            composite = [d for d in dates if d.year in yrs]
            composites[i] = ds.sel(time=composite).mean(dim='time')
        composites = xr.concat(composites, dim='state')
        composites['state'] = ['Nina', 'Neutral', 'Nino']

        plot_maps.plot_corr_maps(composites, row_dim='state', hspace=0.5)
        out = df_ENSO, [
            np.array(nina_yrs),
            np.array(neutral),
            np.array(nino_yrs)
        ], df_state
    else:
        out = df_ENSO
    #%%
    return out
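
A hypothetical call with a preprocessed daily SST NetCDF file (the filename is
illustrative); with get_ENSO_states=True the function returns the ENSO index,
the Nina/neutral/Nino year arrays and the yearly state DataFrame:

df_ENSO, (nina, neutral, nino), df_state = ENSO_34(
    'sst_1979-2018_daily_2.5deg.nc', df_splits=None, get_ENSO_states=True)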
Example #12
# the head of this list was truncated in the source; an annual window is
# implied by seasons[0] below
start_end_TVdates = [('01-01', '12-31'),
                     ('12-01', '02-28'),
                     ('02-01', '05-30'),
                     ('06-01', '08-31')]

seasons = ['{annual}', '{DJF}', '{MAM}', '{JJA}']

f, ax = plt.subplots(len(seasons), figsize=(10,18), sharex=True)
only_summer = True

for p, startenddate in enumerate(start_end_TVdates):

    if only_summer:
        seldates = core_pp.get_subdates(df_RWE.index,
                                        start_end_date=startenddate)
        df_RWE_am = df_RWE.loc[seldates].groupby(seldates.year).mean()
    else:
        idx = max(0, p - 1)
        _d = dfs[idx].groupby(level=1).mean()  # mean over train-test splits
        seldates = core_pp.get_subdates(_d.index,
                                        start_end_date=startenddate)
        df_RWE_am = _d.loc[seldates].groupby(seldates.year).mean()

    seas = seasons[p]
    RWcolname = df_RWE_am.columns[0]
    df_RWE_am = df_RWE_am.rename({RWcolname: f'$RW_{seas}^E$'}, axis=1)
    df_merge = df_PDO_am.merge(df_RWE_am, left_index=True, right_index=True)
    df_merge = (df_merge - df_merge.mean(0)) / df_merge.std(0)
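
The closing merge-and-standardize step, as a self-contained sketch on random
yearly data (names are illustrative):

import numpy as np
import pandas as pd

idx = pd.Index(range(1980, 2019), name='year')
df_PDO_am = pd.DataFrame({'PDO': np.random.randn(idx.size)}, index=idx)
df_RWE_am = pd.DataFrame({'RW': np.random.randn(idx.size)}, index=idx)
df_merge = df_PDO_am.merge(df_RWE_am, left_index=True, right_index=True)
df_merge = (df_merge - df_merge.mean(0)) / df_merge.std(0)  # column z-scores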