Example #1
def pers_ano_to_extr(filename_ts, RV, kwrgs_events_daily, dict_experiments,
                     name_exp, name_model, n_boot):

    # loading in daily timeseries
    RVfullts = np.load(filename_ts, encoding='latin1',
                       allow_pickle=True).item()['RVfullts95']

    # Retrieve information on input timeseries
    import functions_pp
    dates = functions_pp.get_oneyr(RV.RV_ts.index)
    tfreq = (dates[1] - dates[0]).days
    start_date = dates[0] - pd.Timedelta(f'{tfreq/2}d')
    end_date = dates[-1] + pd.Timedelta(f'{-1+tfreq/2}d')
    yr_daily = pd.date_range(start=start_date,
                             end=end_date,
                             freq=pd.Timedelta('1d'))
    ext_dates = functions_pp.make_dates(RV.RV_ts.index, yr_daily,
                                        RV.RV_ts.index.year[-1])

    df_RV_ts_e = pd.DataFrame(RVfullts.sel(time=ext_dates).values,
                              index=ext_dates,
                              columns=['RV_ts'])
    df_RVfullts = pd.DataFrame(RVfullts.values,
                               index=pd.to_datetime(RVfullts.time.values),
                               columns=['RVfullts'])

    # Make new class based on new kwrgs_events_daily
    RV_d = func_fc.RV_class(df_RVfullts, df_RV_ts_e, kwrgs_events_daily)
    # Ensure that the bins on the daily time series match the original
    ex = dict(sstartdate=f'{yr_daily[0].month}-{yr_daily[0].day}',
              senddate=f'{yr_daily[-1].month}-{yr_daily[-1].day}',
              startyear=ext_dates.year[0],
              endyear=ext_dates.year[-1])
    RV_d.RV_bin, dates_gr = functions_pp.time_mean_bins(RV_d.RV_bin, ex, tfreq)
    RV_d.RV_bin[RV_d.RV_bin > 0] = 1
    RV_d.TrainIsTrue = RV.TrainIsTrue
    RV_d.RV_mask = RV.RV_mask
    # add new probability of event occurrence
    RV_d.prob_clim = func_fc.get_obs_clim(RV_d)

    dict_comparison = {}
    # load results of the model predicting persistent anomalies
    orig_event_perc = np.round(1 - float(RV.prob_clim.mean()), 2)
    new_name = '{}d mean +{}p to +{}p events'.format(
        tfreq, orig_event_perc, kwrgs_events_daily['event_percentile'])

    dict_sum = dict_experiments[name_exp]
    df_valid, RV, y_pred = dict_sum[models[-1]]

    blocksize = valid.get_bstrap_size(RV.RVfullts, plot=False)
    out = valid.get_metrics_sklearn(RV_d,
                                    y_pred,
                                    RV_d.prob_clim,
                                    n_boot=n_boot,
                                    blocksize=blocksize)
    df_valid, metrics_dict = out
    dict_comparison[new_name] = {name_model: (df_valid, RV_d, y_pred)}
    return dict_comparison
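
Note on the n_boot and blocksize arguments: the metrics are bootstrapped in blocks of consecutive samples so that temporal autocorrelation is respected. valid.get_metrics_sklearn and valid.get_bstrap_size are project code; the following is only a minimal standalone sketch of drawing one moving-block bootstrap resample (all names are illustrative):

import numpy as np

def block_bootstrap_indices(n: int, blocksize: int, seed=None) -> np.ndarray:
    # draw consecutive blocks with replacement until n indices are collected
    rng = np.random.default_rng(seed)
    n_blocks = int(np.ceil(n / blocksize))
    starts = rng.integers(0, n - blocksize + 1, size=n_blocks)
    idx = np.concatenate([np.arange(s, s + blocksize) for s in starts])
    return idx[:n]  # truncate the last block to the series length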
Example #2
def _daily_to_aggr(df_data, daily_to_aggr: int):
    import functions_pp
    if hasattr(df_data.index, 'levels'):
        splits = df_data.index.levels[0]
        df_data_s = np.zeros(splits.size, dtype=object)
        for s in splits:
            df_data_s[s], dates_tobin = functions_pp.time_mean_bins(
                                                        df_data.loc[s],
                                                        tfreq=daily_to_aggr,
                                                        start_end_date=None,
                                                        start_end_year=None,
                                                        verbosity=0)
        df_data_resample = pd.concat(list(df_data_s), keys=range(splits.size))
    else:
        df_data_resample, dates_tobin = functions_pp.time_mean_bins(df_data,
                                                       tfreq=daily_to_aggr,
                                                       start_end_date=None,
                                                       start_end_year=None,
                                                       verbosity=0)
    return df_data_resample, dates_tobin
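
functions_pp.time_mean_bins is project-specific, but the operation used here, averaging a daily series into consecutive non-overlapping tfreq-day bins, can be sketched with plain pandas (toy data, for illustration only):

import numpy as np
import pandas as pd

daily = pd.DataFrame(np.random.randn(30),
                     index=pd.date_range('2000-06-01', periods=30, freq='D'),
                     columns=['ts'])
tfreq = 5
bins = np.arange(len(daily)) // tfreq        # bin label for each day
agg = daily.groupby(bins).mean()             # one row per 5-day bin
agg.index = daily.index[tfreq // 2::tfreq]   # label each bin by its center date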
Example #3
def ENSO_34(file_path, ex, df_splits=None):
    #%%
    #    file_path = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/sst_1979-2018_1_12_daily_2.5deg.nc'
    '''
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf    
    '''
    if df_splits is None:
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)
        seldates = None
    else:
        seldates = df_splits.loc[0].index

    kwrgs_pp = {
        'selbox': {
            'la_min': -5,  # Niño 3.4 domain: 5°S-5°N, 170°W-120°W
            'la_max': 5,
            'lo_min': -170,
            'lo_max': -120
        },
        'seldates': seldates
    }

    ds = core_pp.import_ds_lazy(file_path, **kwrgs_pp)

    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = functions_pp.time_mean_bins(ds,
                                                ex,
                                                to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates

    dates = pd.to_datetime(ds.time.values)
    splits = df_splits.index.levels[0]

    list_splits = []
    for s in splits:

        progress = 100 * (s + 1) / splits.size
        print(f"\rProgress ENSO traintest set {progress}%)", end="")

        data = functions_pp.area_weighted(ds).mean(dim=('latitude',
                                                        'longitude'))

        list_splits.append(
            pd.DataFrame(data=data.values,
                         index=dates,
                         columns=['0_900_ENSO34']))

    df_ENSO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_ENSO
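
functions_pp.area_weighted is part of this codebase; a common equivalent in plain xarray (assuming a regular lat/lon grid with coordinates named as in the snippet) weights each grid cell by the cosine of its latitude before averaging:

import numpy as np
import xarray as xr

def area_weighted_mean(da: xr.DataArray) -> xr.DataArray:
    # grid-cell area on a regular lat/lon grid scales with cos(latitude)
    weights = np.cos(np.deg2rad(da.latitude))
    return da.weighted(weights).mean(dim=('latitude', 'longitude'))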
Example #4
def import_precur_ts(import_prec_ts, df_splits, to_freq, start_end_date,
                     start_end_year):
    '''
    import_prec_ts has format tuple (name, path_data)
    '''
    splits = df_splits.index.levels[0]
    df_data_ext_s   = np.zeros( (splits.size) , dtype=object)
    counter = 0
    for i, (name, path_data) in enumerate(import_prec_ts):
        for s in range(splits.size):
            # skip first col because it is the RV ts
            df_data_e = func_fc.load_hdf5(path_data)['df_data'].iloc[:,1:].loc[s]
            cols_ts = np.logical_or(df_data_e.dtypes == 'float64', df_data_e.dtypes == 'float32')
            cols_ext = list(df_data_e.columns[cols_ts])
            # cols_ext must be of format '{}_{int}_{}'
            lab_int = 100
            for j, c in enumerate(cols_ext):
                char = c.split('_')[1]
                if not char.isdigit():
                    cols_ext[j] = c.replace(char, str(lab_int)) + char
                    lab_int += 1

            df_data_ext_s[s] = df_data_e[cols_ext]
            tfreq_date_e = (df_data_e.index[1] - df_data_e.index[0]).days
            
            if to_freq != tfreq_date_e:
                try:
                    df_data_ext_s[s] = functions_pp.time_mean_bins(df_data_ext_s[s], 
                                                         to_freq,
                                                        start_end_date,
                                                        start_end_year)[0]
                except KeyError as e:
                    print('KeyError captured, likely the requested dates '
                          'given by start_end_date and start_end_year are not '
                          'found in external pandas timeseries.\n{}'.format(str(e)))
                                                        
        if counter == 0:
            df_data_ext = pd.concat(list(df_data_ext_s), keys=range(splits.size))
        else:
            df_add = pd.concat(list(df_data_ext_s), keys=range(splits.size))
            df_data_ext = df_data_ext.merge(df_add,
                                            left_index=True,
                                            right_index=True)
        counter += 1
    return df_data_ext
Example #5
    def __init__(self,
                 fullts: pd.DataFrame,
                 RV_ts: pd.DataFrame,
                 kwrgs_events: Union[dict, tuple],
                 only_RV_events: bool = True,
                 fit_model_dates: Tuple[str, str] = None):
        '''
        only_RV_events : bool. Decides whether to calculate the RV_bin on the
        whole fullts timeseries, or only on RV_ts
        '''
        #%%
        #        self.RV_ts = pd.DataFrame(df_data[df_data.columns[0]][0][df_data['RV_mask'][0]] )
        #        self.fullts = pd.DataFrame(df_data[df_data.columns[0]][0])
        self.name = fullts.columns[0]
        self.RV_ts = RV_ts
        self.fullts = fullts
        self.dates_all = fullts.index
        self.dates_RV = RV_ts.index
        self.n_oneRVyr = self.dates_RV[self.dates_RV.year ==
                                       self.dates_RV.year[0]].size
        nonleap = self.dates_all[~self.dates_all.is_leap_year]
        self.tfreq = (nonleap[1] - nonleap[0]).days
        if self.tfreq != 365 and self.tfreq != 1:
            self.dates_tobin = self.aggr_to_daily_dates(self.dates_RV,
                                                        tfreq=self.tfreq)

        def handle_fit_model_dates(dates_RV, dates_all, RV_ts,
                                   fit_model_dates):
            if fit_model_dates is None:
                # RV_ts and RV_ts_fit are equal if fit_model_dates = None
                bool_mask = [
                    True if d in dates_RV else False for d in dates_all
                ]
                fit_model_mask = pd.DataFrame(bool_mask,
                                              columns=['fit_model_mask'],
                                              index=dates_all)
                RV_ts_fit = RV_ts
                fit_dates = dates_RV
            else:
                startperiod, endperiod = fit_model_dates
                startyr = dates_all[0].year
                endyr = dates_all[-1].year
                #                if dates_all.resolution == 'day':
                #                    tfreq = (dates_all[1] - dates_all[0]).days
                start_end_date = (startperiod, endperiod)
                start_end_year = (startyr, endyr)
                fit_dates = core_pp.get_subdates(dates_all,
                                                 start_end_date=start_end_date,
                                                 start_end_year=start_end_year)
                bool_mask = [
                    True if d in fit_dates else False for d in dates_all
                ]
                fit_model_mask = pd.DataFrame(bool_mask,
                                              columns=['fit_model_mask'],
                                              index=dates_all)

                RV_ts_fit = fullts[fit_model_mask.values]
            return fit_model_mask, fit_dates, RV_ts_fit

        out = handle_fit_model_dates(self.dates_RV, self.dates_all, self.RV_ts,
                                     fit_model_dates)
        self.fit_model_mask, self.fit_dates, self.RV_ts_fit = out

        if kwrgs_events is not None:
            # make RV_bin for events based on aggregated daymeans
            if kwrgs_events['window'] == 'mean':
                # RV_ts and RV_ts_fit are equal if fit_model_dates = None
                self.threshold = Ev_threshold(self.RV_ts,
                                              kwrgs_events['event_percentile'])
                self.threshold_ts_fit = Ev_threshold(
                    self.RV_ts_fit, kwrgs_events['event_percentile'])

                # unpack other optional arguments for defining event timeseries
                redun_keys = ['event_percentile', 'window']
                kwrgs = {
                    key: item
                    for key, item in kwrgs_events.items()
                    if key not in redun_keys
                }

                if only_RV_events == True:
                    out = Ev_timeseries(self.RV_ts_fit,
                                        threshold=self.threshold_ts_fit,
                                        **kwrgs)
                    self.RV_bin_fit, self.RV_dur = out
                    self.RV_bin = self.RV_bin_fit.loc[self.dates_RV]
                elif only_RV_events == False:
                    out = Ev_timeseries(self.fullts,
                                        threshold=self.threshold,
                                        **kwrgs)
                    self.RV_b_full, self.RV_dur = out
                    self.RV_bin = self.RV_b_full.loc[self.dates_RV]

                self.freq_per_year = RV_class.get_freq_years(self)

            # make RV_bin for extreme occurring in time window
            if type(kwrgs_events['window']) is pd.DataFrame:

                fullts = kwrgs_events['window']
                dates_RVe = self.aggr_to_daily_dates(self.dates_RV,
                                                     tfreq=self.tfreq)
                dates_alle = self.aggr_to_daily_dates(self.dates_all,
                                                      tfreq=self.tfreq)

                self.df_RV_ts_e = fullts.loc[dates_RVe]
                df_fullts_e = fullts.loc[dates_alle]

                out = handle_fit_model_dates(dates_RVe, dates_alle,
                                             self.df_RV_ts_e, fit_model_dates)
                self.fit_model_mask, self.fit_dates, self.RV_ts_fit_e = out

                # RV_ts and RV_ts_fit are equal if fit_model_dates = None
                self.threshold = Ev_threshold(self.df_RV_ts_e,
                                              kwrgs_events['event_percentile'])
                self.threshold_ts_fit = Ev_threshold(
                    self.RV_ts_fit_e, kwrgs_events['event_percentile'])

                # unpack other optional arguments for defining event timeseries
                redun_keys = ['event_percentile', 'window']
                kwrgs = {
                    key: item
                    for key, item in kwrgs_events.items()
                    if key not in redun_keys
                }

                if only_RV_events == True:
                    # RV_bin_fit is defined such that we can fit on RV_bin_fit
                    # but validate on RV_bin
                    out = Ev_timeseries(self.df_RV_ts_e,
                                        threshold=self.threshold_ts_fit,
                                        **kwrgs)
                    self.RV_bin_fit_e, self.RV_dur = out
                    self.RV_bin_e = self.RV_bin_fit_e.loc[dates_RVe]
                elif only_RV_events == False:
                    print('check code, not supported yet')

                # convert daily binary to window binary
                if self.tfreq != 1:
                    self.RV_bin, dates_gr = functions_pp.time_mean_bins(
                        self.RV_bin_e.astype('float'), self.tfreq, None, None)
                    self.RV_bin_fit, dates_gr = functions_pp.time_mean_bins(
                        self.RV_bin_fit_e.astype('float'), self.tfreq, None,
                        None)
                else:
                    print(
                        'Warning: tfreq must be larger than 1 to calculate the window binary'
                    )

                # all bins, with mean > 0 contained an 'extreme' event
                self.RV_bin_fit[self.RV_bin_fit > 0] = 1
                self.RV_bin[self.RV_bin > 0] = 1
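
Ev_threshold and Ev_timeseries are project functions; conceptually, the binary event series used here is a percentile threshold applied to the target timeseries. A hedged sketch of that idea (ignoring the min_dur/max_break/grouped options):

import numpy as np
import pandas as pd

def events_from_percentile(ts: pd.Series, event_percentile: float = 90) -> pd.Series:
    # days exceeding the percentile threshold count as events (1), else 0
    threshold = np.percentile(ts.values, event_percentile)
    return (ts > threshold).astype(int)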
Example #6
def spatial_valid(var_filename,
                  mask,
                  y_pred_all,
                  y_pred_c,
                  lags_i=None,
                  seldates=None,
                  clusters=None,
                  kwrgs_events=None,
                  alpha=0.05,
                  n_boot=0,
                  blocksize=10,
                  threshold_pred='upper_clim'):
    '''
    var_filename must be a 3d netcdf file with only one variable.
    mask can be a nc file containing only a mask, or a latlon box in the format
    [west_lon, east_lon, south_lat, north_lat] in common west-east degrees.
    '''
    # debug overrides; these hardcoded paths shadow the function arguments:
    # var_filename = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/preprocessed/t2mmax_US_1979-2018_1jan_31dec_daily_0.25deg.nc'
    # mask = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/preprocessed/cluster_output.nc'

    if lags_i is None:
        lags_i = list(y_pred_all.columns)

    # load in daily xarray and mask
    xarray = core_pp.import_ds_lazy(var_filename)
    npmask = cl.get_spatial_ma(var_filename, mask)

    # process temporal info
    freq = (y_pred_c.index[1] - y_pred_c.index[0]).days
    start_end_date = None
    if seldates is None:
        seldates = aggr_to_daily_dates(y_pred_c.index)
        start = f'{seldates[0].month}-{seldates[0].day}'
        end = f'{seldates[-1].month}-{seldates[-1].day}'
        start_end_date = (start, end)
    xarray, dates = functions_pp.time_mean_bins(xarray,
                                                to_freq=freq,
                                                start_end_date=start_end_date)

    # if switching to event timeseries:
    if kwrgs_events is None:
        kwrgs_events = {'event_percentile': 66}
    # unpack other optional arguments for defining event timeseries
    kwrgs = {
        key: item
        for key, item in kwrgs_events.items() if key != 'event_percentile'
    }

    if clusters is None:
        clusters = list(np.unique(npmask[~np.isnan(npmask)]))
    elif type(clusters) is int:
        clusters = [clusters]

    dict_allclus = {}
    for clus in clusters:

        latloni = np.where(npmask == clus)
        latloni = [(latloni[0][i], latloni[1][i])
                   for i in range(latloni[0].size)]

        futures = {}
        with ProcessPoolExecutor(max_workers=max_cpu) as pool:

            for ll in latloni:
                xr_gridcell = xarray.isel(latitude=ll[0]).isel(longitude=ll[1])
                threshold = func_fc.Ev_threshold(
                    xr_gridcell, kwrgs_events['event_percentile'])
                y_i = func_fc.Ev_timeseries(xr_gridcell, threshold, **kwrgs)[0]

                futures[ll] = pool.submit(valid.get_metrics_sklearn,
                                          y_i.values,
                                          y_pred_all[lags_i],
                                          y_pred_c,
                                          alpha=alpha,
                                          n_boot=n_boot,
                                          blocksize=blocksize,
                                          threshold_pred=threshold_pred)
        results = {key: future.result() for key, future in futures.items()}
        dict_allclus[clus] = results

    df_valid = dict_allclus[clus][ll][0]
    metrics = np.unique(df_valid.index.get_level_values(0))
    lags_tf = [l * freq for l in lags_i]
    if freq != 1:
        # the last day of the time mean bin is tfreq/2 later than the centered day
        lags_tf = [
            l_tf - int(freq / 2) if l_tf != 0 else 0 for l_tf in lags_tf
        ]

    for clus in clusters:
        results = dict_allclus[clus]
        xroutput = xarray.isel(time=lags_i).rename({'time': 'lag'})
        xroutput['lag'] = lags_tf
        xroutput = xroutput.expand_dims({'metric': metrics}, 0)
        npdata = np.array(np.zeros_like(xroutput), dtype='float32')
        for ll in latloni:
            df_valid = dict_allclus[clus][ll][0]
            for i, met in enumerate(metrics):
                lat_i = ll[0]
                lon_i = ll[1]
                npdata[i, :, lat_i, lon_i] = df_valid.loc[met].loc[met]
        xroutput.values = npdata

    plot_maps.plot_corr_maps(xroutput.where(npmask == clus),
                             row_dim='metric',
                             size=4,
                             clevels=np.arange(-1, 1.1, 0.2))
    BSS = xroutput.where(npmask == clus).sel(metric='BSS')
    plot_maps.plot_corr_maps(BSS,
                             row_dim='metric',
                             size=4,
                             clevels=np.arange(-0.25, 0.251, 0.05),
                             cbar_vert=-0.1)
Example #7
                                       start_end_date=rg.start_end_TVdate)
        RV_ts = rg.fulltso.sel(time=dates_RV)
        ds_v300 = core_pp.import_ds_lazy(rg.list_precur_pp[1][1])
        dslocal = core_pp.get_selbox(ds_v300, selbox=selbox)



        datesRW = core_pp.get_subdates(pd.to_datetime(dslocal.time.values),
                                       start_end_date=rg.start_end_TVdate)
        datesRW = datesRW + pd.Timedelta(f'{lag}d')
        dslocal = dslocal.sel(time=datesRW)

        wv6local = core_pp.get_selbox(xarray.sel(lag=5), selbox=selbox)
        patternlocal = wv6local.mean(dim='lag')
        ts = find_precursors.calc_spatcov(dslocal, patternlocal)
        ts_15, d = functions_pp.time_mean_bins(ts, tfreq, start_end_date=start_end_TVdate,
                                                   closed_on_date=start_end_TVdate[-1])
        RV_15, d = functions_pp.time_mean_bins(RV_ts, tfreq, start_end_date=start_end_TVdate,
                                                   closed_on_date=start_end_TVdate[-1])
        corr_value = np.corrcoef(ts_15.values.squeeze(), RV_15.values.squeeze())[0][1]
        print('corr: {:.2f}'.format(corr_value))
        values.append(corr_value)
    plt.plot(range(-9,10), values[1:])
    # df_wv6 = ts_15.to_dataframe(name='wv6p2')
#%%
sst = rg.list_for_MI[2]

dates_years = functions_pp.get_oneyr(sst.df_splits.loc[0].index, *event_dates.year)
sst.precur_arr.sel(time=dates_years).mean(dim='time').plot(vmin=-.3, vmax=.3,
                                                           cmap=plt.cm.RdBu_r)
Example #8
def import_precur_ts(list_import_ts: List[tuple],
                     df_splits: pd.DataFrame,
                     start_end_date: Tuple[str, str],
                     start_end_year: Tuple[int, int],
                     start_end_TVdate: Tuple[str, str],
                     cols: list = None,
                     precur_aggr: int = 1):
    '''
    list_import_ts has format List[tuples],
    [(name, path_data)]
    '''
    #%%
    # df_splits = rg.df_splits

    splits = df_splits.index.levels[0]
    orig_traintest = functions_pp.get_testyrs(df_splits)
    df_data_ext_s = np.zeros((splits.size), dtype=object)
    counter = 0
    for i, (name, path_data) in enumerate(list_import_ts):

        df_data_e_all = functions_pp.load_hdf5(path_data)['df_data']
        if type(df_data_e_all) is pd.Series:
            df_data_e_all = pd.DataFrame(df_data_e_all)

        df_data_e_all = df_data_e_all.iloc[:, :]  # not sure why needed
        if cols is None:
            cols = list(
                df_data_e_all.columns[(df_data_e_all.dtypes != bool).values])
        elif type(cols) is str:
            cols = [cols]

        if hasattr(df_data_e_all.index, 'levels'):
            dates_subset = core_pp.get_subdates(df_data_e_all.loc[0].index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[pd.IndexSlice[:,
                                                            dates_subset], :]
        else:
            dates_subset = core_pp.get_subdates(df_data_e_all.index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[dates_subset]

        if 'TrainIsTrue' in df_data_e_all.columns:
            _c = [
                k for k in df_splits.columns
                if k in ['TrainIsTrue', 'RV_mask']
            ]
            # check if traintest split is correct
            ext_traintest = functions_pp.get_testyrs(df_data_e_all[_c])
            _check_traintest = all(
                np.equal(core_pp.flatten(ext_traintest),
                         core_pp.flatten(orig_traintest)))
            assert _check_traintest, (
                'Train test years of df_splits are not the '
                'same as imported timeseries')

        for s in range(splits.size):
            if 'TrainIsTrue' in df_data_e_all.columns:
                df_data_e = df_data_e_all.loc[s]
            else:
                df_data_e = df_data_e_all

            df_data_ext_s[s] = df_data_e[cols]
            tfreq_date_e = (df_data_e.index[1] - df_data_e.index[0]).days

            if precur_aggr != tfreq_date_e:
                try:
                    df_data_ext_s[s] = functions_pp.time_mean_bins(
                        df_data_ext_s[s],
                        precur_aggr,
                        start_end_date,
                        start_end_year,
                        start_end_TVdate=start_end_TVdate)[0]
                except KeyError as e:
                    print('KeyError captured, likely the requested dates '
                          'given by start_end_date and start_end_year are not '
                          'found in external pandas timeseries.\n{}'.format(
                              str(e)))
        print(f'loaded in external timeseries: {cols}')

        if counter == 0:
            df_data_ext = pd.concat(list(df_data_ext_s),
                                    keys=range(splits.size))
        else:
            df_add = pd.concat(list(df_data_ext_s), keys=range(splits.size))
            df_data_ext = df_data_ext.merge(df_add,
                                            left_index=True,
                                            right_index=True)
        counter += 1
        cols = None
    #%%
    return df_data_ext
Example #9
def run_PCMCI(ex, outdic_actors, s, df_splits, map_proj):
    #=====================================================================================
    #
    # 4) PCMCI-algorithm
    #
    #=====================================================================================

    # save output
    if ex['SaveTF'] == True:
        #        from contextlib import redirect_stdout
        orig_stdout = sys.stdout
        # buffer print statement output to f
        if sys.version[:1] == '3':
            sys.stdout = f = io.StringIO()
        elif sys.version[:1] == '2':
            sys.stdout = f = open(os.path.join(ex['fig_subpath'], 'old.txt'),
                                  'w+')

    #%%
    # amount of text printed:
    verbosity = 3

    # alpha level for independence test within the pc procedure (finding parents)
    pc_alpha = ex['pcA_sets'][ex['pcA_set']]
    # alpha level for the multiple linear regression model while conditioning
    # on parents of parents
    alpha_level = ex['alpha_level_tig']
    print('run tigramite 4, run.pcmci')
    print(('alpha level(s) for independence tests within the pc procedure '
           '(finding parents): {}'.format(pc_alpha)))
    print(('alpha level for multiple linear regression model while conditioning '
           'on parents of parents: {}'.format(ex['alpha_level_tig'])))

    # Retrieve traintest info
    traintest = df_splits

    # load Response Variable class
    RV = ex[ex['RV_name']]
    # create list with all actors, these will be merged into the fulldata array
    allvar = ex['vars'][0]
    var_names_corr = []
    actorlist = []
    cols = [[RV.name]]

    for var in allvar[:]:
        print(var)
        actor = outdic_actors[var]
        if actor.ts_corr[s].size != 0:
            ts_train = actor.ts_corr[s].values
            actorlist.append(ts_train)
            # create array which numbers the regions
            var_idx = allvar.index(var)
            n_regions = actor.ts_corr[s].shape[1]
            actor.var_info = [[i + 1, actor.ts_corr[s].columns[i], var_idx]
                              for i in range(n_regions)]
            # Array of corresponding regions with var_names_corr (first entry is RV)
            var_names_corr = var_names_corr + actor.var_info
            cols.append(list(actor.ts_corr[s].columns))
            index_dates = actor.ts_corr[s].index
    var_names_corr.insert(0, RV.name)

    # stack actor time-series together:
    fulldata = np.concatenate(tuple(actorlist), axis=1)

    print(('There are {} regions in total'.format(fulldata.shape[1])))
    # add the full 1D time series of interest as first entry:

    fulldata = np.column_stack((RV.RVfullts, fulldata))
    df_data = pd.DataFrame(fulldata, columns=flatten(cols), index=index_dates)

    if ex['import_prec_ts'] == True:
        var_names_full = var_names_corr.copy()
        for d in ex['precursor_ts']:
            path_data = d[1]
            if len(path_data) > 1:
                path_data = ''.join(list(path_data))
            # skip first col because it is the RV ts
            df_data_ext = func_fc.load_hdf5(
                path_data)['df_data'].iloc[:, 1:].loc[s]
            cols_ts = np.logical_or(df_data_ext.dtypes == 'float64',
                                    df_data_ext.dtypes == 'float32')
            cols_ext = list(df_data_ext.columns[cols_ts])
            # cols_ext must be of format '{}_{int}_{}'
            lab_int = 100
            for i, c in enumerate(cols_ext):
                char = c.split('_')[1]
                if not char.isdigit():
                    cols_ext[i] = c.replace(char, str(lab_int)) + char
                    lab_int += 1

            df_data_ext = df_data_ext[cols_ext]
            to_freq = ex['tfreq']
            if to_freq != 1:
                start_end_date = (ex['sstartdate'], ex['senddate'])
                start_end_year = (ex['startyear'], ex['endyear'])
                df_data_ext = functions_pp.time_mean_bins(df_data_ext,
                                                          to_freq,
                                                          start_end_date,
                                                          start_end_year,
                                                          seldays='part')[0]
            # Expand var_names_corr
            n = var_names_full[-1][0] + 1
            add_n = n + len(cols_ext)
            n_var_idx = var_names_full[-1][-1] + 1
            for i in range(n, add_n):
                var_names_full.append([i, cols_ext[i - n], n_var_idx])
            df_data = df_data.merge(df_data_ext,
                                    left_index=True,
                                    right_index=True)
    else:
        var_names_full = var_names_corr

    bool_train = traintest.loc[s]['TrainIsTrue']
    bool_RV_train = np.logical_and(bool_train, traintest.loc[s]['RV_mask'])
    dates_train = traintest.loc[s]['TrainIsTrue'][bool_train].index
    dates_RV_train = traintest.loc[s]['TrainIsTrue'][bool_RV_train].index

    RVfull_train = RV.RVfullts.sel(time=dates_train)
    datesfull_train = pd.to_datetime(RVfull_train.time.values)
    data = df_data.loc[datesfull_train].values
    print((data.shape))

    # get RV datamask (same shape als data)
    data_mask = [
        True if d in dates_RV_train else False for d in datesfull_train
    ]
    data_mask = np.repeat(data_mask, data.shape[1]).reshape(data.shape)

    # add traintest mask to fulldata
    #    dates_all = pd.to_datetime(RV.RVfullts.index)
    #    dates_RV  = pd.to_datetime(RV.RV_ts.index)
    dates_all = pd.to_datetime(RV.RVfullts.time.values)
    dates_RV = pd.to_datetime(RV.RV_ts.time.values)
    df_data['TrainIsTrue'] = [
        True if d in datesfull_train else False for d in dates_all
    ]
    df_data['RV_mask'] = [True if d in dates_RV else False for d in dates_all]

    # ======================================================================================================================
    # tigramite 3
    # ======================================================================================================================

    T, N = data.shape  # Time, Regions
    # ======================================================================================================================
    # Initialize dataframe object (needed for tigramite functions)
    # ======================================================================================================================
    dataframe = pp.DataFrame(data=data,
                             mask=data_mask,
                             var_names=var_names_full)
    # ======================================================================================================================
    # pc algorithm: only parents for selected_variables are calculated
    # ======================================================================================================================

    parcorr = ParCorr(significance='analytic',
                      mask_type='y',
                      verbosity=verbosity)
    #==========================================================================
    # multiple testing problem:
    #==========================================================================
    pcmci = PCMCI(dataframe=dataframe,
                  cond_ind_test=parcorr,
                  selected_variables=None,
                  verbosity=4)

    # selected_variables : list of integers, optional (default: range(N))
    #    Specify to estimate parents only for selected variables. If None is
    #    passed, parents are estimated for all variables.

    # ======================================================================================================================
    #selected_links = dictionary/None
    results = pcmci.run_pcmci(tau_max=ex['tigr_tau_max'],
                              pc_alpha=pc_alpha,
                              tau_min=0,
                              max_combinations=ex['max_comb_actors'])

    q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                           fdr_method='fdr_bh')

    pcmci.print_significant_links(p_matrix=results['p_matrix'],
                                  q_matrix=q_matrix,
                                  val_matrix=results['val_matrix'],
                                  alpha_level=alpha_level)

    # returns all parents, not just causal precursors (of lag>0)
    sig = rgcpd.return_sign_parents(pcmci,
                                    pq_matrix=q_matrix,
                                    val_matrix=results['val_matrix'],
                                    alpha_level=alpha_level)

    all_parents = sig['parents']
    #    link_matrix = sig['link_matrix']

    links_RV = all_parents[0]

    df = rgcpd.bookkeeping_precursors(links_RV, var_names_full)
    #%%

    rgcpd.print_particular_region_new(links_RV, var_names_corr, s,
                                      outdic_actors, map_proj, ex)

    #%%
    if ex['SaveTF'] == True:
        if sys.version[:1] == '3':
            fname = f's{s}_' + ex['params'] + '.txt'
            file = io.open(os.path.join(ex['fig_subpath'], fname), mode='w+')
            file.write(f.getvalue())
            file.close()
            f.close()
        elif sys.version[:1] == '2':
            f.close()
        sys.stdout = orig_stdout

    return df, df_data
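
pcmci.get_corrected_pvalues(..., fdr_method='fdr_bh') applies the Benjamini-Hochberg false-discovery-rate correction. For intuition only (not a tigramite replacement), the same correction on a flat array of p-values looks like:

import numpy as np

def fdr_bh(pvals) -> np.ndarray:
    # Benjamini-Hochberg adjusted p-values (q-values)
    p = np.asarray(pvals, dtype=float)
    order = np.argsort(p)
    scaled = p[order] * p.size / (np.arange(p.size) + 1)
    # enforce monotonicity: q_i = min over j >= i of scaled_j
    q = np.minimum.accumulate(scaled[::-1])[::-1]
    out = np.empty_like(q)
    out[order] = np.clip(q, 0, 1)
    return out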
Example #10
def plot_ts_matric(df_init, win: int=None, lag=0, columns: list=None, rename: dict=None,
                   period='fullyear', plot_sign_stars=True, fontsizescaler=0):
    #%%
    '''
    period : one of ['fullyear', 'RV_mask', 'RM_mask_lag60']
    '''
    if columns is None:
        columns = list(df_init.columns[(df_init.dtypes != bool).values])


    df_cols = df_init[columns]


    if hasattr(df_init.index, 'levels'):
        splits = df_init.index.levels[0]
        print('extracting RV dates from test set')
        dates_RV_orig   = df_init.loc[0].index[df_init.loc[0]['RV_mask']==True]
        TrainIsTrue = df_init['TrainIsTrue']
        dates_full_orig = df_init.loc[0].index
        list_test = []
        for s in range(splits.size):
            TestIsTrue = TrainIsTrue[s]==False
            list_test.append(df_cols.loc[s][TestIsTrue])
        df_test = pd.concat(list_test).sort_index()
    else:
        df_test = df_init
        dates_full_orig = df_init.index

    if lag != 0:
        # shift precursor vs. tmax
        for c in df_test.columns[1:]:
            df_test[c] = df_test[c].shift(periods=-lag)

    # bin means
    if win is not None:
        oneyr = get_oneyr(df_test.index)
        start_end_date = (f'{oneyr[0].month:02d}-{oneyr[0].day:02d}',
                          f'{oneyr[-1].month:02d}-{oneyr[-1].day:02d}')
        df_test = time_mean_bins(df_test, win, start_end_date=start_end_date)[0]


    if period=='fullyear':
        dates_sel = dates_full_orig.strftime('%Y-%m-%d')
    if 'RV_mask' in df_init.columns:
        if period == 'RV_mask':
            dates_sel = dates_RV_orig.strftime('%Y-%m-%d')
        elif period == 'RM_mask_lag60':
            dates_sel = (dates_RV_orig - pd.Timedelta(60, unit='d')).strftime('%Y-%m-%d')

    # after resampling, not all dates are necessarily still present:
    dates_sel = pd.to_datetime([d for d in dates_sel if d in df_test.index])
    df_period = df_test.loc[dates_sel, :].dropna()

    if rename is not None:
        df_period = df_period.rename(rename, axis=1)

    corr, sig_mask, pvals = corr_matrix_pval(df_period, alpha=0.01)

    # Generate a mask for the upper triangle
    mask_tri = np.zeros_like(corr, dtype=bool)
    mask_tri[np.triu_indices_from(mask_tri)] = True
    mask_sig = mask_tri.copy()
    mask_sig[sig_mask==False] = True

    # removing meaningless row and column
    cols = corr.columns
    corr = corr.drop(cols[0], axis=0).drop(cols[-1], axis=1)
    mask_sig = mask_sig[1:, :-1]
    mask_tri = mask_tri[1:, :-1]
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(10, 10))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, n=9, l=30, as_cmap=True)

    ax = sns.heatmap(corr, ax=ax, mask=mask_tri, cmap=cmap, vmax=1E99, center=0,
                square=True, linewidths=.5,
                 annot=False, annot_kws={'size':30+fontsizescaler}, cbar=False)

    if plot_sign_stars:
        sig_bold_labels = sig_bold_annot(corr, mask_sig)
    else:
        sig_bold_labels = corr.round(2).astype(str).values
    # Draw the heatmap with the mask and correct aspect ratio
    ax = sns.heatmap(corr, ax=ax, mask=mask_tri, cmap=cmap, vmax=1, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .8},
                 annot=sig_bold_labels, annot_kws={'size':30+fontsizescaler}, cbar=False, fmt='s')

    ax.tick_params(axis='both', labelsize=15+fontsizescaler,
                   bottom=True, top=False, left=True, right=False,
                   labelbottom=True, labeltop=False, labelleft=True, labelright=False)

    ax.set_xticklabels(corr.columns, fontdict={'fontweight':'bold',
                                               'fontsize':20+fontsizescaler})
    ax.set_yticklabels(corr.index, fontdict={'fontweight':'bold',
                                               'fontsize':20+fontsizescaler}, rotation=0)
    #%%
    return
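
corr_matrix_pval is project code; a minimal sketch of the same idea, a pairwise Pearson correlation matrix plus a significance mask, can be written with scipy (assuming a complete, numeric DataFrame):

import numpy as np
import pandas as pd
from scipy import stats

def corr_matrix_pvals(df: pd.DataFrame, alpha: float = 0.01):
    cols = df.columns
    n = len(cols)
    corr = pd.DataFrame(np.eye(n), index=cols, columns=cols)
    pvals = pd.DataFrame(np.zeros((n, n)), index=cols, columns=cols)
    for i in range(n):
        for j in range(i + 1, n):
            r, p = stats.pearsonr(df.iloc[:, i], df.iloc[:, j])
            corr.iloc[i, j] = corr.iloc[j, i] = r
            pvals.iloc[i, j] = pvals.iloc[j, i] = p
    sig_mask = pvals.values < alpha  # True where correlation is significant
    return corr, sig_mask, pvals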
Example #11
    def __init__(self,
                 fullts,
                 RV_ts,
                 kwrgs_events=None,
                 only_RV_events=True,
                 fit_model_dates=None):
        '''
        only_RV_events : bool. Decides whether to calculate the RV_bin on the
        whole fullts timeseries, or only on RV_ts
        '''
        #%%
        #        self.RV_ts = pd.DataFrame(df_data[df_data.columns[0]][0][df_data['RV_mask'][0]] )
        #        self.fullts = pd.DataFrame(df_data[df_data.columns[0]][0])
        self.RV_ts = RV_ts
        self.fullts = fullts
        self.dates_all = fullts.index
        self.dates_RV = RV_ts.index
        self.n_oneRVyr = self.dates_RV[self.dates_RV.year ==
                                       self.dates_RV.year[0]].size
        self.tfreq = (self.dates_all[1] - self.dates_all[0]).days

        def handle_fit_model_dates(dates_RV, dates_all, RV_ts,
                                   fit_model_dates):
            if fit_model_dates is None:
                # RV_ts and RV_ts_fit are equal if fit_model_dates = None
                bool_mask = [
                    True if d in dates_RV else False for d in dates_all
                ]
                fit_model_mask = pd.DataFrame(bool_mask,
                                              columns=['fit_model_mask'],
                                              index=dates_all)
                RV_ts_fit = RV_ts
                fit_dates = dates_RV
            else:
                startperiod, endperiod = fit_model_dates
                startyr = dates_all[0].year
                endyr = dates_all[-1].year
                if dates_all.resolution == 'day':
                    tfreq = (dates_all[1] - dates_all[0]).days
                ex = {
                    'startperiod': startperiod,
                    'endperiod': endperiod,
                    'tfreq': tfreq
                }
                fit_dates = functions_pp.make_RVdatestr(
                    dates_all, ex, startyr, endyr)
                bool_mask = [
                    True if d in fit_dates else False for d in dates_all
                ]
                fit_model_mask = pd.DataFrame(bool_mask,
                                              columns=['fit_model_mask'],
                                              index=dates_all)

                RV_ts_fit = fullts[fit_model_mask.values]
            return fit_model_mask, fit_dates, RV_ts_fit

        out = handle_fit_model_dates(self.dates_RV, self.dates_all, self.RV_ts,
                                     fit_model_dates)
        self.fit_model_mask, self.fit_dates, self.RV_ts_fit = out

        # make RV_bin for events based on aggregated daymeans
        if kwrgs_events is not None and (type(kwrgs_events) is not tuple
                                         or self.tfreq == 1):

            if type(kwrgs_events) is tuple:
                kwrgs_events = kwrgs_events[1]
            # RV_ts and RV_ts_fit are equal if fit_model_dates = None
            self.threshold = func_fc.Ev_threshold(
                self.RV_ts, kwrgs_events['event_percentile'])
            self.threshold_ts_fit = func_fc.Ev_threshold(
                self.RV_ts_fit, kwrgs_events['event_percentile'])

            # unpack other optional arguments for defining event timeseries
            kwrgs = {
                key: item
                for key, item in kwrgs_events.items()
                if key != 'event_percentile'
            }
            if only_RV_events == True:
                self.RV_bin_fit = func_fc.Ev_timeseries(
                    self.RV_ts_fit, threshold=self.threshold_ts_fit,
                    **kwrgs)[0]
                self.RV_bin = self.RV_bin_fit.loc[self.dates_RV]
            elif only_RV_events == False:
                self.RV_b_full = func_fc.Ev_timeseries(
                    self.fullts, threshold=self.threshold, **kwrgs)[0]
                self.RV_bin = self.RV_b_full.loc[self.dates_RV]

            self.freq = func_fc.get_freq_years(self.RV_bin)

        # make RV_bin for extreme occurring in time window
        if kwrgs_events is not None and type(
                kwrgs_events) is tuple and self.tfreq != 1:

            filename_ts = kwrgs_events[0]
            kwrgs_events_daily = kwrgs_events[1]
            # loading in daily timeseries
            fullts_xr = np.load(filename_ts,
                                encoding='latin1',
                                allow_pickle=True).item()['RVfullts95']

            # Retrieve information on input timeseries
            def aggr_to_daily_dates(dates_precur_data):
                dates = functions_pp.get_oneyr(dates_precur_data)
                tfreq = (dates[1] - dates[0]).days
                start_date = dates[0] - pd.Timedelta(f'{int(tfreq/2)}d')
                end_date = dates[-1] + pd.Timedelta(f'{int(-1+tfreq/2+0.5)}d')
                yr_daily = pd.date_range(start=start_date,
                                         end=end_date,
                                         freq=pd.Timedelta('1d'))
                years = np.unique(dates_precur_data.year)
                ext_dates = functions_pp.make_dates(yr_daily, years)

                return ext_dates

            dates_RVe = aggr_to_daily_dates(self.dates_RV)
            dates_alle = aggr_to_daily_dates(self.dates_all)

            df_RV_ts_e = pd.DataFrame(fullts_xr.sel(time=dates_RVe).values,
                                      index=dates_RVe,
                                      columns=['RV_ts'])

            df_fullts_e = pd.DataFrame(fullts_xr.sel(time=dates_alle).values,
                                       index=dates_alle,
                                       columns=['fullts'])

            out = handle_fit_model_dates(dates_RVe, dates_alle, df_RV_ts_e,
                                         fit_model_dates)
            self.fit_model_mask, self.fit_dates, self.RV_ts_fit_e = out

            # RV_ts and RV_ts_fit are equal if fit_model_dates = None
            self.threshold = func_fc.Ev_threshold(
                df_RV_ts_e, kwrgs_events_daily['event_percentile'])
            self.threshold_ts_fit = func_fc.Ev_threshold(
                self.RV_ts_fit_e, kwrgs_events_daily['event_percentile'])

            if only_RV_events == True:
                # RV_bin_fit is defined such that we can fit on RV_bin_fit
                # but validate on RV_bin
                self.RV_bin_fit = func_fc.Ev_timeseries(
                    df_RV_ts_e,
                    threshold=self.threshold_ts_fit,
                    min_dur=kwrgs_events_daily['min_dur'],
                    max_break=kwrgs_events_daily['max_break'],
                    grouped=kwrgs_events_daily['grouped'])[0]
                self.RV_bin = self.RV_bin_fit.loc[dates_RVe]
            elif only_RV_events == False:
                self.RV_b_full = func_fc.Ev_timeseries(
                    self.fullts,
                    threshold=self.threshold,
                    min_dur=kwrgs_events_daily['min_dur'],
                    max_break=kwrgs_events_daily['max_break'],
                    grouped=kwrgs_events_daily['grouped'])[0]
                self.RV_bin = self.RV_b_full.loc[self.dates_RV]

            # convert daily binary to window probability binary
            if self.tfreq != 1:
                self.RV_bin, dates_gr = functions_pp.time_mean_bins(
                    self.RV_bin.astype('float'), self.tfreq, None, None)
                self.RV_bin_fit, dates_gr = functions_pp.time_mean_bins(
                    self.RV_bin_fit.astype('float'), self.tfreq, None, None)

            # all bins, with mean > 0 contained an 'extreme' event
            self.RV_bin_fit[self.RV_bin_fit > 0] = 1
            self.RV_bin[self.RV_bin > 0] = 1
Example #12
def PDO(filename, ex, df_splits=None):
    #%%
    '''
    The PDO is calculated based upon all data points in the training years.
    Subsequently, the PDO pattern is projected onto sst.sel(time=dates_train)
    to enable retrieving the PDO timeseries on a subset of the year.
    It is similarly projected onto the dates_test.
    From https://climatedataguide.ucar.edu/climate-data/pacific-decadal-oscillation-pdo-definition-and-indices
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf
    '''
    t0 = time()
    if df_splits is None:
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)

    kwrgs_pp = {
        'selbox': {
            'la_min': 20,  # select domain in degrees east
            'la_max': 65,
            'lo_min': 115,
            'lo_max': 250
        },
        'format_lon': 'only_east'
    }
    ds = core_pp.import_ds_lazy(filename, **kwrgs_pp)

    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = functions_pp.time_mean_bins(ds,
                                                ex,
                                                to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates

    dates = pd.to_datetime(ds.time.values)

    splits = df_splits.index.levels[0]
    data = np.zeros((splits.size, ds.latitude.size, ds.longitude.size))
    PDO_patterns = xr.DataArray(
        data,
        coords=[splits, ds.latitude.values, ds.longitude.values],
        dims=['split', 'latitude', 'longitude'])

    def PDO_single_split(s, ds, df_splits, PDO_patterns):
        progress = 100 * (s + 1) / splits.size
        dates_train = df_splits.loc[s]['TrainIsTrue'][df_splits.loc[s]
                                                      ['TrainIsTrue']].index
        train_yrs = np.unique(dates_train.year)
        dates_all_train = pd.to_datetime(
            [d for d in dates if d.year in train_yrs])
        dates_test = df_splits.loc[s]['TrainIsTrue'][
            ~df_splits.loc[s]['TrainIsTrue']].index
        n = dates_train.size
        r = int(100 * n / df_splits.loc[s].index.size)
        print(
            f"\rProgress PDO traintest set {progress}%, trainsize=({n}dp, {r}%)",
            end="")

        PDO_pattern, solver, adjust_sign = get_PDO(
            ds.sel(time=dates_all_train))
        # use the pattern computed for this split; PDO_patterns is only
        # filled from the returned results after the pool has finished
        data_train = find_precursors.calc_spatcov(ds.sel(time=dates_train),
                                                  PDO_pattern)
        data_test = find_precursors.calc_spatcov(ds.sel(time=dates_test),
                                                 PDO_pattern)

        df_test = pd.DataFrame(data=data_test.values,
                               index=dates_test,
                               columns=['0_901_PDO'])
        df_train = pd.DataFrame(data=data_train.values,
                                index=dates_train,
                                columns=['0_901_PDO'])

        df = pd.concat([df_test, df_train]).sort_index()
        return (df, PDO_pattern)

    pool = ProcessPoolExecutor(os.cpu_count() - 1)  # amount of cores - 1
    futures = [
        pool.submit(PDO_single_split, s, ds, df_splits, PDO_patterns)
        for s in splits
    ]
    results = [future.result() for future in futures]

    list_splits = [r[0] for r in results]

    time_ = time() - t0
    print(time_ / 60)

    for s in splits:
        PDO_patterns[s] = results[s][1]

    df_PDO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_PDO, PDO_patterns
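
find_precursors.calc_spatcov projects the fixed PDO pattern onto the SST field at every time step. One plausible plain-xarray sketch of such a pattern projection (the project's exact normalization may differ) is:

import xarray as xr

def project_pattern(field: xr.DataArray, pattern: xr.DataArray) -> xr.DataArray:
    # spatial covariance between a fixed pattern and the field per time step
    f_anom = field - field.mean(dim=('latitude', 'longitude'))
    p_anom = pattern - pattern.mean(dim=('latitude', 'longitude'))
    return (f_anom * p_anom).mean(dim=('latitude', 'longitude'))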
Example #13
def PDO_temp(filename, ex, df_splits=None):
    #%%
    '''
    The PDO is calculated based upon all data points in the training years.
    Subsequently, the PDO pattern is projected onto sst.sel(time=dates_train)
    to enable retrieving the PDO timeseries on a subset of the year.
    It is similarly projected onto the dates_test.
    From https://climatedataguide.ucar.edu/climate-data/pacific-decadal-oscillation-pdo-definition-and-indices
    '''

    if df_splits is None:
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)

    kwrgs_pp = {
        'selbox': {
            'la_min': 20,  # select domain in degrees east
            'la_max': 65,
            'lo_min': 115,
            'lo_max': 250
        },
        'format_lon': 'only_east'
    }
    ds = core_pp.import_ds_lazy(filename, **kwrgs_pp)

    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = functions_pp.time_mean_bins(ds,
                                                ex,
                                                to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates

    dates = pd.to_datetime(ds.time.values)

    splits = df_splits.index.levels[0]
    data = np.zeros((splits.size, ds.latitude.size, ds.longitude.size))
    PDO_patterns = xr.DataArray(
        data,
        coords=[splits, ds.latitude.values, ds.longitude.values],
        dims=['split', 'latitude', 'longitude'])
    list_splits = []
    for s in splits:

        progress = 100 * (s + 1) / splits.size
        dates_train = df_splits.loc[s]['TrainIsTrue'][df_splits.loc[s]
                                                      ['TrainIsTrue']].index
        train_yrs = np.unique(dates_train.year)
        dates_all_train = pd.to_datetime(
            [d for d in dates if d.year in train_yrs])
        dates_test = df_splits.loc[s]['TrainIsTrue'][
            ~df_splits.loc[s]['TrainIsTrue']].index
        n = dates_train.size
        r = int(100 * n / df_splits.loc[s].index.size)
        print(
            f"\rProgress PDO traintest set {progress}%, trainsize=({n}dp, {r}%)",
            end="")

        PDO_patterns[s], solver, adjust_sign = get_PDO(
            ds.sel(time=dates_all_train))

        PDO_patterns[s] = PDO_patterns[s].interpolate_na(dim='longitude')
        data_train = find_precursors.calc_spatcov(ds.sel(time=dates_train),
                                                  PDO_patterns[s])
        data_test = find_precursors.calc_spatcov(ds.sel(time=dates_test),
                                                 PDO_patterns[s])

        df_test = pd.DataFrame(data=data_test.values,
                               index=dates_test,
                               columns=['0_901_PDO'])
        df_train = pd.DataFrame(data=data_train.values,
                                index=dates_train,
                                columns=['0_901_PDO'])

        df = pd.concat([df_test, df_train]).sort_index()
        list_splits.append(df)

    df_PDO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_PDO