Example #1
def _check_y_fitmask(fit_masks, lag_i, base_lag):
    ''' If lag_i is uneven, taking the mean over the RV period may result in
    a shorter y_fit (RV_mask) than the original RV_mask (where the time-mean
    bins were done on its own time axis). Hence y_fit is redefined by adding
    lag_i + base_lag to the x_fit mask.

    Note: y_fit_mask and y_pred_mask are the same now.
    '''
    fit_masks_n = fit_masks.copy()
    y_fit = fit_masks['y_fit']
    x_fit = fit_masks['x_fit']
    y_dates_RV = x_fit[x_fit].index + pd.Timedelta(lag_i + base_lag, 'd')
    y_dates_pr = y_fit[y_fit].index
    mismatch = (functions_pp.get_oneyr(y_dates_pr)[0]- \
                functions_pp.get_oneyr(y_dates_RV)[0] ).days
    y_fit_corr = y_dates_RV + pd.Timedelta(mismatch, 'd')
    y_fit_mask = [True if d in y_fit_corr else False for d in x_fit.index]
    fit_masks_n.loc[:, 'y_fit'] = np.array(y_fit_mask)

    y_pred = fit_masks['y_pred']
    x_pred = fit_masks['x_pred']
    y_dates_RV = x_pred[x_pred].index + pd.Timedelta(lag_i + base_lag, 'd')
    y_dates_pr = y_pred[y_pred].index
    mismatch = (functions_pp.get_oneyr(y_dates_pr)[0]- \
                functions_pp.get_oneyr(y_dates_RV)[0] ).days
    y_pred_corr = y_dates_RV + pd.Timedelta(mismatch, 'd')
    y_pred_mask = [True if d in y_pred_corr else False for d in x_pred.index]
    fit_masks_n.loc[:, 'y_pred'] = np.array(y_pred_mask)
    size_y_fit = fit_masks_n['y_fit'][fit_masks_n['y_fit']].dropna().size
    assert size_y_fit == y_dates_RV.size, ('y_fit mask will not match RV '
                                           'dates length')
    return fit_masks_n
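All of these examples revolve around `functions_pp.get_oneyr`. Judging from how it is called (a `pd.DatetimeIndex`, optionally followed by one or more years), it returns the dates belonging to one year, defaulting to the first year in the index. A minimal standalone sketch of that behaviour; the name and implementation below are illustrative, not the project's code:

import numpy as np
import pandas as pd

def get_oneyr_sketch(dates: pd.DatetimeIndex, *years) -> pd.DatetimeIndex:
    # default to the first year in the index, mirroring how
    # functions_pp.get_oneyr(dates) is used in the snippets on this page
    if len(years) == 0:
        years = [dates.year[0]]
    return dates[np.isin(dates.year, years)]

dates = pd.date_range('2000-06-01', '2002-08-31', freq='15D')
print(get_oneyr_sketch(dates).size)              # timesteps in the first year (2000)
print(get_oneyr_sketch(dates, 2001, 2002).size)  # timesteps in 2001 and 2002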
Example #2
def check_NaNs(field, ts):
    '''
    Return shortened timeseries of both field and ts if a few NaNs are
    detected at the boundaries due to a large lag. At the edges of the
    time axis, large lags often result in NaNs because of missing data.
    Timesteps are removed when the entire field at that timestep is NaN.
    A ValueError is raised if more timesteps are NaN than there are
    datapoints in a single year.
    '''
    t = functions_pp.get_oneyr(field).size  # threshold NaNs allowed.
    field = np.reshape(field.values, (field.shape[0], -1))
    i = 0
    # check NaNs in first year
    if bool(np.isnan(field[i]).all()):
        i += 1
        while bool(np.isnan(field[i]).all()):
            i += 1
            if i > t:
                raise ValueError('More NaNs detected than # of datapoints in '
                                 'single year')
    j = -1
    # check NaNs in last year
    if bool(np.isnan(field[j]).all()):
        j -= 1
        while bool(np.isnan(field[j]).all()):
            j -= 1
            if j < -t:  # j counts backwards from the end, hence compare to -t
                raise ValueError('More NaNs detected than # of datapoints in '
                                 'single year')
    else:
        j = field.shape[0]
    return field[i:j], ts[i:j]
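The same boundary-trimming idea in a self-contained form (plain numpy, with a hypothetical 2-D `field` of shape (time, space) and a matching `ts`); this is a simplified illustration and omits the one-year NaN threshold enforced above:

import numpy as np

def trim_nan_boundaries(field: np.ndarray, ts: np.ndarray):
    # field: (time, space); ts: (time,). Drop timesteps where every grid
    # cell is NaN, but only at the start and end of the time axis.
    all_nan = np.isnan(field).all(axis=1)
    i = 0
    while i < field.shape[0] and all_nan[i]:
        i += 1
    j = field.shape[0]
    while j > i and all_nan[j - 1]:
        j -= 1
    return field[i:j], ts[i:j]

field = np.random.rand(10, 4)
field[0] = np.nan; field[-2:] = np.nan      # NaNs at both boundaries
ts = np.arange(10.)
f_trim, ts_trim = trim_nan_boundaries(field, ts)
print(f_trim.shape, ts_trim)                # (7, 4) [1. 2. ... 7.]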
Example #3
def start_end_date_mean(df_data, start_end_date):

    # create mask to aggregate
    if hasattr(df_data.index, 'levels'):
        pd_dates = df_data.loc[0].index
    else:
        pd_dates = df_data.index
    subset_dates = core_pp.get_subdates(pd_dates , start_end_date)
    dates_to_aggr_mask = pd.Series(np.repeat(False, pd_dates.size), index=pd_dates)
    dates_to_aggr_mask.loc[subset_dates] = True
    if hasattr(df_data.index, 'levels'):
        years = df_data.loc[0][dates_to_aggr_mask].index.year
    else:
        years = df_data[dates_to_aggr_mask].index.year
    index = [functions_pp.get_oneyr(subset_dates, yr).mean() for yr in np.unique(years)]

    if hasattr(df_data.index, 'levels'):
        splits = df_data.index.levels[0]
        df_data_s = np.zeros((splits.size), dtype=object)
        for s in splits:
            df_s = df_data.loc[s]
            df_s = df_s[dates_to_aggr_mask].groupby(years).mean()
            df_s.index = pd.to_datetime(index)
            df_data_s[s] = df_s
        df_data_resample = pd.concat(list(df_data_s), keys=range(splits.size))
    else:
        df_data_resample = df_data[dates_to_aggr_mask].groupby(years).mean()
        df_data_resample.index = pd.to_datetime(index)
    return df_data_resample
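The core of `start_end_date_mean` is: build a boolean mask for a within-year date window, then average the masked rows per calendar year. A rough standalone sketch of that step on a plain (non-MultiIndex) DataFrame, using a hypothetical May-July window instead of `core_pp.get_subdates`:

import numpy as np
import pandas as pd

dates = pd.date_range('2000-01-01', '2002-12-31', freq='10D')
df = pd.DataFrame(np.random.rand(dates.size, 2), index=dates, columns=['a', 'b'])

# mask dates inside the window (here: May-July of every year)
window = (df.index.month >= 5) & (df.index.month <= 7)
df_window = df[window]

# one mean value per year; index each yearly mean on the mean date of its window
df_yearly = df_window.groupby(df_window.index.year).mean()
df_yearly.index = [df_window.index[df_window.index.year == yr].mean()
                   for yr in df_yearly.index]
print(df_yearly)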
Example #4
def pers_ano_to_extr(filename_ts, RV, kwrgs_events_daily, dict_experiments,
                     name_exp, name_model, n_boot):

    # loading in daily timeseries
    RVfullts = np.load(filename_ts, encoding='latin1',
                       allow_pickle=True).item()['RVfullts95']

    # Retrieve information on input timeseries
    import functions_pp
    dates = functions_pp.get_oneyr(RV.RV_ts.index)
    tfreq = (dates[1] - dates[0]).days
    start_date = dates[0] - pd.Timedelta(f'{tfreq/2}d')
    end_date = dates[-1] + pd.Timedelta(f'{-1+tfreq/2}d')
    yr_daily = pd.date_range(start=start_date,
                             end=end_date,
                             freq=pd.Timedelta('1d'))
    ext_dates = functions_pp.make_dates(RV.RV_ts.index, yr_daily,
                                        RV.RV_ts.index.year[-1])

    df_RV_ts_e = pd.DataFrame(RVfullts.sel(time=ext_dates).values,
                              index=ext_dates,
                              columns=['RV_ts'])
    df_RVfullts = pd.DataFrame(RVfullts.values,
                               index=pd.to_datetime(RVfullts.time.values),
                               columns=['RVfullts'])

    # Make new class based on new kwrgs_events_daily
    RV_d = func_fc.RV_class(df_RVfullts, df_RV_ts_e, kwrgs_events_daily)
    # Ensure that the bins on the daily time series match the original
    ex = dict(sstartdate=f'{yr_daily[0].month}-{yr_daily[0].day}',
              senddate=f'{yr_daily[-1].month}-{yr_daily[-1].day}',
              startyear=ext_dates.year[0],
              endyear=ext_dates.year[-1])
    RV_d.RV_bin, dates_gr = functions_pp.time_mean_bins(RV_d.RV_bin, ex, tfreq)
    RV_d.RV_bin[RV_d.RV_bin > 0] = 1
    RV_d.TrainIsTrue = RV.TrainIsTrue
    RV_d.RV_mask = RV.RV_mask
    # add new probability of event occurrence
    RV_d.prob_clim = func_fc.get_obs_clim(RV_d)

    dict_comparison = {}
    # loading model predicting pers. anomalies
    orig_event_perc = np.round(1 - float(RV.prob_clim.mean()), 2)
    new_name = '{}d mean +{}p to +{}p events'.format(
        tfreq, orig_event_perc, kwrgs_events_daily['event_percentile'])

    dict_sum = dict_experiments[name_exp]
    df_valid, RV, y_pred = dict_sum[name_model]

    blocksize = valid.get_bstrap_size(RV.RVfullts, plot=False)
    out = valid.get_metrics_sklearn(RV_d,
                                    y_pred,
                                    RV_d.prob_clim,
                                    n_boot=n_boot,
                                    blocksize=blocksize)
    df_valid, metrics_dict = out
    dict_comparison[new_name] = {name_model: (df_valid, RV_d, y_pred)}
    return dict_comparison
Example #5
            def aggr_to_daily_dates(dates_precur_data):
                dates = functions_pp.get_oneyr(dates_precur_data)
                tfreq = (dates[1] - dates[0]).days
                start_date = dates[0] - pd.Timedelta(f'{int(tfreq/2)}d')
                end_date = dates[-1] + pd.Timedelta(f'{int(-1+tfreq/2+0.5)}d')
                yr_daily = pd.date_range(start=start_date,
                                         end=end_date,
                                         freq=pd.Timedelta('1d'))
                years = np.unique(dates_precur_data.year)
                ext_dates = functions_pp.make_dates(yr_daily, years)

                return ext_dates
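`aggr_to_daily_dates` expands aggregated (e.g. 15-day mean) time stamps back to daily coverage via `functions_pp.make_dates`, which is not shown here. A rough standalone illustration of the same idea, assuming evenly spaced aggregated dates and plain pandas:

import pandas as pd

# hypothetical aggregated dates: one value per 15 days, June-August, 2000-2001
aggr = pd.DatetimeIndex([d for yr in (2000, 2001)
                         for d in pd.date_range(f'{yr}-06-08', f'{yr}-08-22', freq='15D')])

tfreq = (aggr[1] - aggr[0]).days                      # 15
oneyr = aggr[aggr.year == aggr.year[0]]
start = oneyr[0] - pd.Timedelta(f'{int(tfreq/2)}d')   # half a bin before the first label
end = oneyr[-1] + pd.Timedelta(f'{int(-1 + tfreq/2 + 0.5)}d')

# replicate the daily window for every year present in the aggregated index
daily = pd.DatetimeIndex([d for yr in aggr.year.unique()
                          for d in pd.date_range(start.replace(year=yr),
                                                 end.replace(year=yr), freq='D')])
print(daily.size, daily[0], daily[-1])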
Example #6
def pp_calc_ts(precur, precur_aggr=None, kwrgs_load: dict=None,
                      force_reload: bool=False, lags: list=None):
    '''
    Pre-process for calculating timeseries of precursor regions or pattern.
    '''
    #%%
    corr_xr         = precur.corr_xr
    prec_labels     = precur.prec_labels

    if lags is not None:
        lags        = np.array(lags) # ensure lag is np.ndarray
        corr_xr     = corr_xr.sel(lag=lags).copy()
        prec_labels = prec_labels.sel(lag=lags).copy()
    else:
        lags        = prec_labels.lag.values
    dates           = pd.to_datetime(precur.precur_arr.time.values)
    oneyr = functions_pp.get_oneyr(dates)
    if oneyr.size == 1: # single val per year precursor
        tfreq = 365
    else:
        tfreq = (oneyr[1] - oneyr[0]).days


    if precur_aggr is None and force_reload==False:
        precur_arr = precur.precur_arr
    else:
        if precur_aggr is not None:
            precur.tfreq = precur_aggr
        precur.load_and_aggregate_precur(kwrgs_load.copy())
        precur_arr = precur.precur_arr

    if type(precur.lags[0]) is np.ndarray and precur_aggr is None:
        precur.period_means_array = True
    else:
        precur.period_means_array = False

    if precur_arr.shape[-2:] != corr_xr.shape[-2:]:
        print('shape loaded precur_arr != corr map, matching coords')
        corr_xr, prec_labels = functions_pp.match_coords_xarrays(precur_arr,
                                          *[corr_xr, prec_labels])
    #%%
    return precur_arr, corr_xr, prec_labels
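Several of these wrappers infer the temporal aggregation (`tfreq`) from the precursor's time axis: a single timestamp per year is treated as an annual (365-day) aggregation, otherwise the spacing of the first two timestamps within a year is used. A small sketch of that inference on a hypothetical time axis:

import pandas as pd

def infer_tfreq(dates: pd.DatetimeIndex) -> int:
    # dates belonging to the first year on the axis
    oneyr = dates[dates.year == dates.year[0]]
    if oneyr.size == 1:          # single value per year -> annual aggregation
        return 365
    return (oneyr[1] - oneyr[0]).days

print(infer_tfreq(pd.date_range('2000-06-01', '2005-08-31', freq='15D')))        # 15
print(infer_tfreq(pd.to_datetime([f'{yr}-07-01' for yr in range(2000, 2006)])))  # 365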
Example #7
def loop_get_spatcov(precur,
                     precur_aggr=None,
                     kwrgs_load: dict = None,
                     force_reload: bool = False,
                     lags: list = None):

    name = precur.name
    use_sign_pattern = precur.use_sign_pattern
    corr_xr = precur.corr_xr
    prec_labels = precur.prec_labels
    splits = corr_xr.split
    if lags is not None:
        lags = np.array(lags)  # ensure lag is np.ndarray
        corr_xr = corr_xr.sel(lag=lags).copy()
        prec_labels = prec_labels.sel(lag=lags).copy()
    else:
        lags = prec_labels.lag.values
    dates = pd.to_datetime(precur.precur_arr.time.values)
    oneyr = functions_pp.get_oneyr(dates)
    if oneyr.size == 1:  # single val per year precursor
        tfreq = 365
    else:
        tfreq = (oneyr[1] - oneyr[0]).days

    if precur_aggr is None and force_reload == False:
        precur_arr = precur.precur_arr
        if tfreq == 365:
            precur_arr = precur.precur_arr
        # use the precursor array with the temporal aggregation that was used
        # to create the correlation map. When tfreq=365, the aggregation
        # (one value per year) is already done; the period used to aggregate
        # was defined by the lag.

    else:
        if precur_aggr is not None:
            precur.tfreq = precur_aggr
        precur.load_and_aggregate_precur(kwrgs_load.copy())
        precur_arr = precur.precur_arr

    precur.area_grid = find_precursors.get_area(precur_arr)
    if precur_arr.shape[-2:] != corr_xr.shape[-2:]:
        print('shape loaded precur_arr != corr map, matching coords')
        corr_xr, prec_labels = functions_pp.match_coords_xarrays(
            precur_arr, *[corr_xr, prec_labels])

    ts_sp = np.zeros((splits.size), dtype=object)
    for s in splits:
        ts_list = np.zeros((lags.size), dtype=list)
        track_names = []
        for il, lag in enumerate(lags):

            # if lag represents aggregation period:
            if type(precur.lags[il]) is np.ndarray and precur_aggr is None:
                precur_arr = precur.precur_arr.sel(lag=il)

            corr_vals = corr_xr.sel(split=s).isel(lag=il)
            mask = prec_labels.sel(split=s).isel(lag=il)
            pattern = corr_vals.where(~np.isnan(mask))
            if use_sign_pattern == True:
                pattern = np.sign(pattern)
            if np.isnan(pattern.values).all():
                # no regions of this variable and split
                nants = np.zeros((precur_arr.time.size, 1))
                nants[:] = np.nan
                ts_list[il] = nants
            else:
                # if normalize == True:
                #     spatcov_full = calc_spatcov(full_timeserie, pattern)
                #     mean = spatcov_full.sel(time=dates_train).mean(dim='time')
                #     std = spatcov_full.sel(time=dates_train).std(dim='time')
                #     spatcov_test = ((spatcov_full - mean) / std)
                # elif normalize == False:
                xrts = find_precursors.calc_spatcov(precur_arr, pattern)
                ts_list[il] = xrts.values[:, None]
            track_names.append(f'{lag}..0..{precur.name}' + '_sp')

        # concatenate timeseries all of lags
        tsCorr = np.concatenate(tuple(ts_list), axis=1)

        dates = pd.to_datetime(precur_arr.time.values)
        ts_sp[s] = pd.DataFrame(tsCorr, index=dates, columns=track_names)
    # df_sp = pd.concat(list(ts_sp), keys=range(splits.size))
    return ts_sp
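`find_precursors.calc_spatcov` projects the precursor field onto the (sign) pattern per timestep. The project's exact definition is not shown here, but one common form of a spatial-covariance timeseries, sketched with plain numpy for a hypothetical field of shape (time, lat, lon):

import numpy as np

def spatcov_sketch(field: np.ndarray, pattern: np.ndarray) -> np.ndarray:
    # field: (time, lat, lon); pattern: (lat, lon), NaN outside the regions
    mask = ~np.isnan(pattern)
    f = field[:, mask]                     # (time, n_gridcells)
    p = pattern[mask]                      # (n_gridcells,)
    # covariance between the spatial anomalies of field and pattern per timestep
    f_anom = f - f.mean(axis=1, keepdims=True)
    p_anom = p - p.mean()
    return (f_anom * p_anom).mean(axis=1)

field = np.random.rand(20, 4, 5)
pattern = np.full((4, 5), np.nan)
pattern[1, 2:4] = 1.0                      # a small "significant" region
pattern[2, 2:4] = -1.0
print(spatcov_sketch(field, pattern).shape)   # (20,)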
Example #8
    def bivariateMI_map(self, precur_arr, df_splits, RV):  #
        #%%
        # precur_arr = self.precur_arr ; df_splits = rg.df_splits ; RV = rg.TV
        """
        This function calculates the correlation maps for precur_arr for different lags.
        Field significance is applied to test for correltion.
        RV_period: indices that matches the response variable time series
        alpha: significance level

        A land sea mask is assumed from settin all the nan value to True (masked).
        For xrcorr['mask'], all gridcell which are significant are not masked,
        i.e. bool == False
        """

        if type(self.lags) is np.ndarray and type(
                self.lags[0]) is not np.ndarray:
            self.lags = np.array(self.lags, dtype=np.int16)  # fix dtype
            self.lag_coordname = self.lags
        else:
            self.lag_coordname = np.arange(len(self.lags))  # for period_means
        n_lags = len(self.lags)
        lags = self.lags
        self.df_splits = df_splits  # add df_splits to self
        dates = self.df_splits.loc[0].index

        targetstepsoneyr = functions_pp.get_oneyr(RV.RV_ts)
        if type(self.lags[0]) == np.ndarray and targetstepsoneyr.size > 1:
            raise ValueError(
                'Precursor and Target do not align.\n'
                'One aggregated value taken for months '
                f'{self.lags[0]}, while target timeseries has '
                f'multiple timesteps per year:\n{targetstepsoneyr}')
        yrs_precur_arr = np.unique(precur_arr.time.dt.year)
        if np.unique(dates.year).size != yrs_precur_arr.size:
            raise ValueError(
                'Number of years between precursor and Target '
                'do not match. Check if the precursor period is crossyr '
                'while the target period is not. '
                'Manually ensure start_end_year is aligned.')

        oneyr = functions_pp.get_oneyr(dates)
        if oneyr.size == 1:  # single val per year precursor
            self._tfreq = 365
        else:
            self._tfreq = (oneyr[1] - oneyr[0]).days

        n_spl = df_splits.index.levels[0].size
        # make new xarray to store results
        xrcorr = precur_arr.isel(time=0).drop('time').copy()
        orig_mask = np.isnan(precur_arr[1])
        if 'lag' not in xrcorr.dims:
            # add lags
            list_xr = [
                xrcorr.expand_dims('lag', axis=0) for i in range(n_lags)
            ]
            xrcorr = xr.concat(list_xr, dim='lag')
            xrcorr['lag'] = ('lag', self.lag_coordname)
        # add train test split
        list_xr = [xrcorr.expand_dims('split', axis=0) for i in range(n_spl)]
        xrcorr = xr.concat(list_xr, dim='split')
        xrcorr['split'] = ('split', range(n_spl))
        xrpvals = xrcorr.copy()

        def MI_single_split(RV_ts,
                            precur_train,
                            s,
                            alpha=.05,
                            FDR_control=True):

            lat = precur_train.latitude.values
            lon = precur_train.longitude.values

            z = np.zeros((lat.size * lon.size, len(lags)))
            Corr_Coeff = np.ma.array(z, mask=z)
            pvals = np.ones((lat.size * lon.size, len(lags)))

            dates_RV = RV_ts.index
            for i, lag in enumerate(lags):
                if type(lag) is np.int16 and self.lag_as_gap == False:
                    # dates_lag = functions_pp.func_dates_min_lag(dates_RV, self._tfreq*lag)[1]
                    m = apply_shift_lag(self.df_splits.loc[s], lag)
                    dates_lag = m[np.logical_and(m['TrainIsTrue'],
                                                 m['x_fit'])].index
                    corr_val, pval = self.func(
                        precur_train.sel(time=dates_lag),
                        RV_ts.values.squeeze(), **self.kwrgs_func)
                elif type(lag) == np.int16 and self.lag_as_gap == True:
                    # if only shift tfreq, then gap=0
                    datesdaily = RV.aggr_to_daily_dates(dates_RV,
                                                        tfreq=self._tfreq)
                    dates_lag = functions_pp.func_dates_min_lag(
                        datesdaily, self._tfreq + lag)[1]

                    tmb = functions_pp.time_mean_bins
                    corr_val, pval = self.func(
                        tmb(precur_train.sel(time=dates_lag),
                            to_freq=self._tfreq)[0], RV_ts.values.squeeze(),
                        **self.kwrgs_func)
                elif type(lag) == np.ndarray:
                    corr_val, pval = self.func(precur_train.sel(lag=i),
                                               RV_ts.values.squeeze(),
                                               **self.kwrgs_func)

                mask = np.ones(corr_val.size, dtype=bool)
                if FDR_control == True:
                    # test for Field significance and mask non-significant values
                    # FDR control:
                    adjusted_pvalues = multicomp.multipletests(pval,
                                                               method='fdr_bh')
                    ad_p = adjusted_pvalues[1]
                    pvals[:, i] = ad_p
                    mask[ad_p <= alpha] = False

                else:
                    pvals[:, i] = pval
                    mask[pval <= alpha] = False

                Corr_Coeff[:, i] = corr_val[:]
                Corr_Coeff[:, i].mask = mask

            Corr_Coeff = np.ma.array(data=Corr_Coeff[:, :],
                                     mask=Corr_Coeff.mask[:, :])
            Corr_Coeff = Corr_Coeff.reshape(lat.size, lon.size,
                                            len(lags)).swapaxes(2, 1).swapaxes(
                                                1, 0)
            pvals = pvals.reshape(lat.size, lon.size,
                                  len(lags)).swapaxes(2, 1).swapaxes(1, 0)
            return Corr_Coeff, pvals

        print('\n{} - calculating correlation maps'.format(precur_arr.name))
        np_data = np.zeros_like(xrcorr.values)
        np_mask = np.zeros_like(xrcorr.values)
        np_pvals = np.zeros_like(xrcorr.values)
        RV_mask = df_splits.loc[0]['RV_mask']
        for s in xrcorr.split.values:
            progress = int(100 * (s + 1) / n_spl)
            # =============================================================================
            # Split train-test methods: ['random_k_fold', 'leave_k_out', 'no_train_test_split']
            # =============================================================================
            RV_train_mask = np.logical_and(RV_mask,
                                           df_splits.loc[s]['TrainIsTrue'])
            RV_ts = RV.fullts[RV_train_mask.values]
            TrainIsTrue = df_splits.loc[s]['TrainIsTrue'].values
            if self.lag_as_gap:  # no clue why selecting all datapoints, changed 26-01-2021
                train_dates = df_splits.loc[s]['TrainIsTrue'][
                    TrainIsTrue].index
                precur_train = precur_arr.sel(time=train_dates)
            else:
                precur_train = precur_arr[TrainIsTrue]  # only train data

            dates_RV = RV_ts.index
            n = dates_RV.size
            r = int(100 * n / RV.dates_RV.size)
            print(
                f"\rProgress traintest set {progress}%, trainsize=({n}dp, {r}%)",
                end="")

            ma_data, pvals = MI_single_split(RV_ts,
                                             precur_train.copy(),
                                             s,
                                             alpha=self.alpha,
                                             FDR_control=self.FDR_control)

            np_data[s] = ma_data.data
            np_mask[s] = ma_data.mask
            np_pvals[s] = pvals
        print("\n")
        xrcorr.values = np_data
        xrpvals.values = np_pvals
        mask = (('split', 'lag', 'latitude', 'longitude'), np_mask)
        xrcorr.coords['mask'] = mask
        # fill nans with mask = True
        xrcorr['mask'] = xrcorr['mask'].where(orig_mask == False,
                                              other=orig_mask).drop('time')
        #%%
        return xrcorr, xrpvals
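The field-significance step above uses Benjamini-Hochberg FDR control. The code calls it as `multicomp.multipletests`; in current statsmodels the same routine is available as `statsmodels.stats.multitest.multipletests`. A compact sketch of that call on synthetic p-values, reproducing the masking convention used in `MI_single_split`:

import numpy as np
from statsmodels.stats import multitest

rng = np.random.default_rng(0)
pvals = np.concatenate([rng.uniform(0, 0.01, 20),      # "signal" gridcells
                        rng.uniform(0, 1.0, 480)])     # "noise" gridcells

alpha = 0.05
reject, pvals_adj, _, _ = multitest.multipletests(pvals, alpha=alpha,
                                                  method='fdr_bh')

# mask convention as in bivariateMI_map: significant gridcells are NOT masked
mask = np.ones(pvals.size, dtype=bool)
mask[pvals_adj <= alpha] = False
print(reject.sum(), (~mask).sum())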
Example #9
from func_models import standardize_on_train
# summerdates = core_pp.get_subdates(dates, start_end_TVdate)
df_PDOsplit = df_PDO.loc[0]#.loc[summerdates]
# standardize = preprocessing.StandardScaler()
# standardize.fit(df_PDOsplit[df_PDOsplit['TrainIsTrue'].values].values.reshape(-1,1))
# df_PDOsplit = pd.DataFrame(standardize.transform(df_PDOsplit['PDO'].values.reshape(-1,1)),
#                 index=df_PDOsplit.index, columns=['PDO'])
df_PDOsplit = df_PDOsplit[['PDO']].apply(standardize_on_train,
                         args=[df_PDO.loc[0]['TrainIsTrue']],
                         result_type='broadcast')

# Butter Lowpass
yr = 2
dates = df_PDOsplit.index
freqraw = (dates[1] - dates[0]).days
window = int(yr*functions_pp.get_oneyr(dates).size) # 2 year
fig, ax = plt.subplots(1,1)

ax.plot_date(dates, df_PDOsplit.values, label=f'raw ({freqraw} day means)',
              alpha=.2, linestyle='solid', marker=None)
ax.plot_date(dates, filters.lowpass(df_PDOsplit, period=window), label='Butterworth',
        linestyle='solid', linewidth=1, marker=None)
df_PDOrm = df_PDOsplit.rolling(window=window, center=True, min_periods=1).mean()
# ax.plot_date(dates, filters.lowpass(df_PDOrm, period=window), label='Butterworth on rolling mean',
#         linestyle='solid', linewidth=1, marker=None)
ax.plot_date(dates, df_PDOrm,
             label='rolling mean', color='green', linestyle='solid', linewidth=1, marker=None)

ax.legend()
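`filters.lowpass` is project code that is not shown here; assuming it is a Butterworth low-pass filter (as the plot label suggests), a generic scipy-based equivalent might look like this, with `period` given in timesteps:

import numpy as np
from scipy import signal

def butter_lowpass_sketch(x: np.ndarray, period: float, order: int = 2) -> np.ndarray:
    # cutoff frequency = 1/period cycles per timestep (sampling rate fs = 1)
    b, a = signal.butter(order, 1.0 / period, btype='low', fs=1.0)
    # filtfilt gives zero-phase filtering, so the smoothed series is not shifted in time
    return signal.filtfilt(b, a, x)

t = np.arange(2000)
x = np.sin(2 * np.pi * t / 500) + 0.5 * np.random.randn(t.size)   # slow signal + noise
smooth = butter_lowpass_sketch(x, period=120)
print(x.std(), smooth.std())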

Example #10
                                       start_end_date=rg.start_end_TVdate)
        RV_ts = rg.fulltso.sel(time=dates_RV)
        ds_v300 = core_pp.import_ds_lazy(rg.list_precur_pp[1][1])
        dslocal = core_pp.get_selbox(ds_v300, selbox=selbox)



        datesRW = core_pp.get_subdates(pd.to_datetime(dslocal.time.values),
                                       start_end_date=rg.start_end_TVdate)
        datesRW = datesRW + pd.Timedelta(f'{lag}d')
        dslocal = dslocal.sel(time=datesRW)

        wv6local = core_pp.get_selbox(xarray.sel(lag=5), selbox=selbox)
        patternlocal = wv6local.mean(dim='lag')
        ts = find_precursors.calc_spatcov(dslocal, patternlocal)
        ts_15, d = functions_pp.time_mean_bins(ts, tfreq, start_end_date=start_end_TVdate,
                                                   closed_on_date=start_end_TVdate[-1])
        RV_15, d = functions_pp.time_mean_bins(RV_ts, tfreq, start_end_date=start_end_TVdate,
                                                   closed_on_date=start_end_TVdate[-1])
        corr_value = np.corrcoef(ts_15.values.squeeze(), RV_15.values.squeeze())[0][1]
        print('corr: {:.2f}'.format(corr_value))
        values.append(corr_value)
    plt.plot(range(-9,10), values[1:])
    # df_wv6 = ts_15.to_dataframe(name='wv6p2')
#%%
sst = rg.list_for_MI[2]

dates_years = functions_pp.get_oneyr(sst.df_splits.loc[0].index, *event_dates.year)
sst.precur_arr.sel(time=dates_years).mean(dim='time').plot(vmin=-.3, vmax=.3,
                                                           cmap=plt.cm.RdBu_r)
Example #11
def spatial_mean_regions(precur,
                         precur_aggr=None,
                         kwrgs_load: dict = None,
                         force_reload: bool = False,
                         lags: list = None):
    '''
    Wrapper for calculating 1-d spatial mean timeseries per precursor region.

    Parameters
    ----------
    precur : class_BivariateMI instance
    precur_aggr : int, optional
        If None, same precur_arr is used as for the correlation maps.
    kwrgs_load : dict, optional
        kwrgs to load in timeseries. See functions_pp.import_ds_timemeanbins or
        functions_pp.time_mean_period. The default is None.
    force_reload : bool, optional
        Force reload a different precursor array (precur_arr). The default is
        False.

    Returns
    -------
    ts_corr : np.ndarray of pd.DataFrame
        One DataFrame per train-test split containing the (area-weighted)
        spatial-mean timeseries of each precursor region; columns are named
        '{lag}..{region}..{name}'.

    '''
    #%%

    name = precur.name
    corr_xr = precur.corr_xr
    prec_labels = precur.prec_labels
    n_spl = corr_xr.split.size
    use_coef_wghts = precur.use_coef_wghts
    if lags is not None:
        lags = np.array(lags)  # ensure lag is np.ndarray
        corr_xr = corr_xr.sel(lag=lags).copy()
        prec_labels = prec_labels.sel(lag=lags).copy()
    else:
        lags = prec_labels.lag.values
    dates = pd.to_datetime(precur.precur_arr.time.values)
    oneyr = functions_pp.get_oneyr(dates)
    if oneyr.size == 1:  # single val per year precursor
        tfreq = 365
    else:
        tfreq = (oneyr[1] - oneyr[0]).days

    if precur_aggr is None and force_reload == False:
        precur_arr = precur.precur_arr
        if tfreq == 365:
            precur_arr = precur.precur_arr
        # use the precursor array with the temporal aggregation that was used
        # to create the correlation map. When tfreq=365, the aggregation
        # (one value per year) is already done; the period used to aggregate
        # was defined by the lag.

    else:
        if precur_aggr is not None:
            precur.tfreq = precur_aggr
        precur.load_and_aggregate_precur(kwrgs_load.copy())
        precur_arr = precur.precur_arr

    precur.area_grid = get_area(precur_arr)
    if precur_arr.shape[-2:] != corr_xr.shape[-2:]:
        print('shape loaded precur_arr != corr map, matching coords')
        corr_xr, prec_labels = functions_pp.match_coords_xarrays(
            precur_arr, *[corr_xr, prec_labels])

    ts_corr = np.zeros((n_spl), dtype=object)
    for s in range(n_spl):
        corr = corr_xr.isel(split=s)
        labels = prec_labels.isel(split=s)

        ts_list = np.zeros((lags.size), dtype=list)
        track_names = []
        for l_idx, lag in enumerate(lags):
            labels_lag = labels.sel(lag=lag).values

            # if lag represents aggregation period:
            if type(precur.lags[l_idx]) is np.ndarray and precur_aggr is None:
                precur_arr = precur.precur_arr.sel(lag=l_idx)

            regions_for_ts = list(np.unique(labels_lag[~np.isnan(labels_lag)]))
            a_wghts = precur.area_grid / precur.area_grid.mean()
            if use_coef_wghts:
                coef_wghts = abs(corr.sel(lag=lag)) / abs(
                    corr.sel(lag=lag)).max()
                a_wghts *= coef_wghts.values  # area & corr. value weighted

            # this array will be the time series for each feature
            ts_regions_lag_i = np.zeros(
                (precur_arr.values.shape[0], len(regions_for_ts)))

            # track the sign of each region
            sign_ts_regions = np.zeros(len(regions_for_ts))

            # calculate area-weighted mean over features
            for r in regions_for_ts:

                idx = regions_for_ts.index(r)
                # start with empty lonlat array
                B = np.zeros(labels_lag.shape)
                # Mask everything except region of interest
                B[labels_lag == r] = 1
                #        # Calculates how values inside region vary over time, wgts vs anomaly
                #        wgts_ano = meanbox[B==1] / meanbox[B==1].max()
                #        ts_regions_lag_i[:,idx] = np.nanmean(actbox[:,B==1] * cos_box_array[:,B==1] * wgts_ano, axis =1)
                # Calculates how values inside region vary over time
                ts = np.nanmean(precur_arr.values[:, B == 1] * a_wghts[B == 1],
                                axis=1)

                # check for nans
                if ts[np.isnan(ts)].size != 0:
                    print(ts)
                    perc_nans = ts[np.isnan(ts)].size / ts.size
                    if perc_nans == 1:
                        # all NaNs
                        print(f'All timesteps were NaNs split {s}'
                              f' for region {r} at lag {lag}')

                    else:
                        print(f'{perc_nans} NaNs split {s}'
                              f' for region {r} at lag {lag}')

                track_names.append(f'{lag}..{int(r)}..{name}')

                ts_regions_lag_i[:, idx] = ts
                # get sign of region
                sign_ts_regions[idx] = np.sign(
                    np.mean(corr.isel(lag=l_idx).values[B == 1]))

            ts_list[l_idx] = ts_regions_lag_i

        dates = pd.to_datetime(precur_arr.time.values)
        tsCorr = np.concatenate(tuple(ts_list), axis=1)
        df_tscorr = pd.DataFrame(tsCorr, index=dates, columns=track_names)
        df_tscorr.name = str(s)
        ts_corr[s] = df_tscorr
    if any(df_tscorr.isna().values.flatten()):
        print('Warning: NaNs detected')
    #%%
    return ts_corr
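The inner loop above boils down to an area-weighted mean over the gridcells carrying a given region label. A compact numpy sketch of that step, with hypothetical label and weight arrays:

import numpy as np

def region_mean_sketch(field: np.ndarray, labels: np.ndarray,
                       region: int, weights: np.ndarray) -> np.ndarray:
    # field: (time, lat, lon); labels: (lat, lon) with NaN outside regions;
    # weights: (lat, lon), e.g. gridcell area / mean gridcell area
    B = labels == region                       # boolean mask of this region
    return np.nanmean(field[:, B] * weights[B], axis=1)

field = np.random.rand(12, 3, 4)
labels = np.full((3, 4), np.nan)
labels[0, :2] = 1; labels[2, 1:] = 2
weights = np.ones((3, 4))
print(region_mean_sketch(field, labels, 1, weights).shape)   # (12,)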
Example #12
    print('\n', month, '\n')
    keys = [k for k in rg.df_data.columns[:-2] if k not in [rg.TV.name, 'PDO']]
    if target == 'easterntemp':
        keys = [k for k in keys if int(k.split('..')[1]) in [1, 2]]
    if remove_PDO:
        y_keys = [k for k in keys if 'sst' in k]
        rg.df_data[y_keys], fig = wPCMCI.df_data_remove_z(rg.df_data,
                                                          z=['PDO'],
                                                          keys=y_keys,
                                                          standardize=False)
        fig_path = os.path.join(rg.path_outsub1,
                                f'regressing_out_PDO_tf{month}')
        fig.savefig(fig_path + rg.figext, bbox_inches='tight')

    if any(rg._df_count == rg.n_spl):  # at least one timeseries always present
        oneyr = functions_pp.get_oneyr(
            rg.df_data['RV_mask'].loc[0][rg.df_data['RV_mask'].loc[0]])
        oneyrsize = oneyr.size
        if monthkeys.index(month) >= 1:
            nextyr = functions_pp.get_oneyr(
                rg.df_data['RV_mask'].loc[0][rg.df_data['RV_mask'].loc[0]])
            if nextyr.size != oneyrsize:
                raise ValueError

        fc_mask = rg.df_data.iloc[:, -1].loc[0]  #.shift(lag, fill_value=False)
        # rg.df_data = rg._replace_RV_mask(rg.df_data, replace_RV_mask=(fc_mask))
        target_ts = rg.df_data.iloc[:, [0]].loc[0][fc_mask]
        target_ts = (target_ts - target_ts.mean()) / target_ts.std()

        # ScikitModel = scikitlinear.LassoCV

        out_fit = rg.fit_df_data_ridge(target=target_ts,
Example #13
plt.ylabel('frequency (1/year)')
RV.freq_per_year.plot(kind='bar')
fname = 'freq_per_year.png'
filename = os.path.join(ex['path_fig'], fname)
plt.savefig(filename)

#%% get timeseries:

#ERA5_filename = 'era5_t2mmax_US_1979-2018_averAggljacc0.25d_tf1_n4__to_t2mmax_US_tf1_selclus4_okt19.npy'
GHCND_filename = "PEP-T95TimeSeries.txt"

RV, ex = load_data.load_response_variable(ex)
T95_ERA5 = RV.RV_ts
ex['RV1d_ts_path'] = '/Users/semvijverberg/surfdrive/MckinRepl/RVts'
T95_GHCND, GHCND_dates = load_data.read_T95(GHCND_filename, ex)
dates = functions_pp.get_oneyr(RV.dates_RV, 2012)
shared_dates = functions_pp.get_oneyr(RV.dates_RV, *list(range(1982, 2016)))
#%%
data = np.stack([
    T95_GHCND.sel(time=shared_dates).values,
    T95_ERA5.loc[shared_dates].values.squeeze()
],
                axis=1)
df = pd.DataFrame(data, columns=['GHCND', 'ERA-5'], index=shared_dates)

dfplots.plot_oneyr_events(df, 'std', 2012)
plt.savefig(os.path.join(ex['path_fig'], 'timeseries_ERA5_GHCND.png'),
            bbox_inches='tight')

#%% Weighting features if they are extracted every run (training set)
# weighted by persistence of pattern over
Example #14
File: df_ana.py Project: VU-IVM/RGCPD
def plot_ts_matric(df_init, win: int=None, lag=0, columns: list=None, rename: dict=None,
                   period='fullyear', plot_sign_stars=True, fontsizescaler=0):
    #%%
    '''
    period : one of ['fullyear', 'RV_mask', 'RM_mask_lag60']
    '''
    if columns is None:
        columns = list(df_init.columns[(df_init.dtypes != bool).values])


    df_cols = df_init[columns]


    if hasattr(df_init.index, 'levels'):
        splits = df_init.index.levels[0]
        print('extracting RV dates from test set')
        dates_RV_orig   = df_init.loc[0].index[df_init.loc[0]['RV_mask']==True]
        TrainIsTrue = df_init['TrainIsTrue']
        dates_full_orig = df_init.loc[0].index
        list_test = []
        for s in range(splits.size):
            TestIsTrue = TrainIsTrue[s]==False
            list_test.append(df_cols.loc[s][TestIsTrue])
        df_test = pd.concat(list_test).sort_index()
    else:
        df_test = df_init
        dates_full_orig = df_init.index

    if lag != 0:
        # shift precursor vs. tmax
        for c in df_test.columns[1:]:
            df_test[c] = df_test[c].shift(periods=-lag)

    # bin means
    if win is not None:
        oneyr = get_oneyr(df_test.index)
        start_end_date = (f'{oneyr[0].month:02d}-{oneyr[0].day:02d}',
                          f'{oneyr[-1].month:02d}-{oneyr[-1].day:02d}')
        df_test = time_mean_bins(df_test, win, start_end_date=start_end_date)[0]


    if period=='fullyear':
        dates_sel = dates_full_orig.strftime('%Y-%m-%d')
    if 'RV_mask' in df_init.columns:
        if period == 'RV_mask':
            dates_sel = dates_RV_orig.strftime('%Y-%m-%d')
        elif period == 'RM_mask_lag60':
            dates_sel = (dates_RV_orig - pd.Timedelta(60, unit='d')).strftime('%Y-%m-%d')

    # after resampling, not all of the original dates may still be present:
    dates_sel =  pd.to_datetime([d for d in dates_sel if d in df_test.index] )
    df_period = df_test.loc[dates_sel, :].dropna()

    if rename is not None:
        df_period = df_period.rename(rename, axis=1)

    corr, sig_mask, pvals = corr_matrix_pval(df_period, alpha=0.01)

    # Generate a mask for the upper triangle
    mask_tri = np.zeros_like(corr, dtype=bool)
    mask_tri[np.triu_indices_from(mask_tri)] = True
    mask_sig = mask_tri.copy()
    mask_sig[sig_mask==False] = True

    # removing meaningless row and column
    cols = corr.columns
    corr = corr.drop(cols[0], axis=0).drop(cols[-1], axis=1)
    mask_sig = mask_sig[1:, :-1]
    mask_tri = mask_tri[1:, :-1]
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(10, 10))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, n=9, l=30, as_cmap=True)

    ax = sns.heatmap(corr, ax=ax, mask=mask_tri, cmap=cmap, vmax=1E99, center=0,
                square=True, linewidths=.5,
                 annot=False, annot_kws={'size':30+fontsizescaler}, cbar=False)

    if plot_sign_stars:
        sig_bold_labels = sig_bold_annot(corr, mask_sig)
    else:
        sig_bold_labels = corr.round(2).astype(str).values
    # Draw the heatmap with the mask and correct aspect ratio
    ax = sns.heatmap(corr, ax=ax, mask=mask_tri, cmap=cmap, vmax=1, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .8},
                 annot=sig_bold_labels, annot_kws={'size':30+fontsizescaler}, cbar=False, fmt='s')

    ax.tick_params(axis='both', labelsize=15+fontsizescaler,
                   bottom=True, top=False, left=True, right=False,
                   labelbottom=True, labeltop=False, labelleft=True, labelright=False)

    ax.set_xticklabels(corr.columns, fontdict={'fontweight':'bold',
                                               'fontsize':20+fontsizescaler})
    ax.set_yticklabels(corr.index, fontdict={'fontweight':'bold',
                                               'fontsize':20+fontsizescaler}, rotation=0)
    #%%
    return
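`corr_matrix_pval` and `sig_bold_annot` are helpers from df_ana.py that are not shown here. A rough sketch of what the first presumably computes (pairwise Pearson correlations plus p-values and a significance mask), assuming scipy; names and layout are illustrative:

import numpy as np
import pandas as pd
from scipy import stats

def corr_matrix_pval_sketch(df: pd.DataFrame, alpha: float = 0.01):
    cols = df.columns
    corr = pd.DataFrame(np.eye(len(cols)), index=cols, columns=cols)
    pvals = pd.DataFrame(np.zeros((len(cols), len(cols))), index=cols, columns=cols)
    for i, ci in enumerate(cols):
        for cj in cols[i + 1:]:
            r, p = stats.pearsonr(df[ci], df[cj])
            corr.loc[ci, cj] = corr.loc[cj, ci] = r
            pvals.loc[ci, cj] = pvals.loc[cj, ci] = p
    sig_mask = (pvals <= alpha).values       # True where significant
    return corr, sig_mask, pvals

df = pd.DataFrame(np.random.randn(100, 3), columns=['x', 'y', 'z'])
corr, sig, p = corr_matrix_pval_sketch(df)
print(corr.round(2))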
Example #15
File: df_ana.py Project: VU-IVM/RGCPD
def plot_timeseries(y, timesteps: list=None,
                    selyears: Union[list, int]=None, title=None,
                    legend: bool=True, nth_xyear: int=10, ax=None):
    # ax=None
    #%%


    if hasattr(y.index,'levels'):
        y_ac = y.loc[0]
    else:
        y_ac = y

    if isinstance(y_ac.index, pd.DatetimeIndex):
        datetimes = y_ac.index

    if timesteps is None and selyears is None:
        ac, con_int = autocorr_sm(y_ac)
        where = np.where(con_int[:,0] < 0 )[0]
        # has to be below 0 for n times (not necessarily consecutive):
        n = 1
        n_of_times = np.array([idx+1 - where[0] for idx in where])
        if n_of_times.size != 0:
            cutoff = where[np.where(n_of_times == n)[0][0] ]
        else:
            cutoff = 100

        timesteps = min(y_ac.index.size, 10*cutoff)
        datetimes = y_ac.iloc[:timesteps].index

    if selyears is not None and timesteps is None:
        if type(selyears) is int:
            selyears = [selyears]
        datetimes = get_oneyr(y.index, *selyears)

    if timesteps is not None and selyears is None:
        datetimes = datetimes[:timesteps]

    if ax is None:
        fig, ax = plt.subplots(constrained_layout=True)

    if hasattr(y.index,'levels'):
        for fold in y.index.levels[0]:
            if legend:
                label = f'f {fold+1}' ; color = None ; alpha=.5
            else:
                label = None ; color = 'red' ; alpha=.1
            ax.plot(datetimes, y.loc[fold, datetimes], alpha=alpha,
                    label=label, color=color)
        if legend:
            ax.legend(prop={'size':6})
    else:
        ax.plot(datetimes, y.loc[datetimes])

    if nth_xyear is None:
        nth_xtick = round(len(ax.xaxis.get_ticklabels())/5)
        for n, label in enumerate(ax.xaxis.get_ticklabels()):
            if n % nth_xtick != 0:
                label.set_visible(False)
    else:
        ax.xaxis.set_major_locator(mdates.YearLocator(1)) # set tick every year
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y')) # format %Y
        for n, label in enumerate(ax.xaxis.get_ticklabels()):
            if n % nth_xyear != 0:
                label.set_visible(False)

    ax.tick_params(axis='both', which='major', labelsize=8)
    if title is not None:
        ax.set_title(title, fontsize=10)
    #%%
    return ax
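`autocorr_sm` is another helper from the same module; it presumably wraps an autocorrelation function with confidence intervals, e.g. statsmodels' `acf`. A sketch under that assumption, mirroring how `plot_timeseries` uses `con_int[:, 0] < 0` to find the lag where the autocorrelation stops being significant:

import numpy as np
from statsmodels.tsa.stattools import acf

def autocorr_sm_sketch(y, nlags: int = 100):
    # with alpha set, acf returns (autocorrelation, confidence interval per lag)
    return acf(np.asarray(y).squeeze(), nlags=nlags, alpha=0.05, fft=True)

y = np.cumsum(np.random.randn(500))        # strongly autocorrelated toy series
ac, con_int = autocorr_sm_sketch(y)
print(np.where(con_int[:, 0] < 0)[0][:5])  # first lags with insignificant acf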
Example #16
            # standardize.fit(df_PDOsplit[df_PDOsplit['TrainIsTrue'].values].values.reshape(-1,1))
            # df_PDOsplit = pd.DataFrame(standardize.transform(df_PDOsplit['PDO'].values.reshape(-1,1)),
            #                 index=df_PDOsplit.index, columns=['PDO'])
        df_PDOsplit = df_PDOsplit[['PDO']].apply(standardize_on_train,
                             args=[df_PDO.loc[0]['TrainIsTrue']],
                             result_type='broadcast')

        # Butter Lowpass
        dates = df_PDOsplit.index
        freqraw = (dates[1] - dates[0]).days
        ls = ['solid', 'dotted', 'dashdot', 'dashed']
        fig, ax = plt.subplots(1,1)
        list_dfPDO = [df_PDOsplit]
        lowpass_yrs = [.25, .5, 1.0, 2.0]
        for i, yr in enumerate(lowpass_yrs):
            window = int(yr*functions_pp.get_oneyr(dates).size) # window length in timesteps
            if i ==0:
                ax.plot_date(dates, df_PDOsplit.values, label=f'Raw ({freqraw} day means)',
                          alpha=.3, linestyle='solid', marker=None)
            df_PDObw = pd.Series(filters.lowpass(df_PDOsplit, period=window).squeeze(),
                                 index=dates, name=f'PDO{yr}bw')
            ax.plot_date(dates, df_PDObw, label=f'Butterworth {yr}-year low-pass',
                    color='red',linestyle=ls[i], linewidth=1, marker=None)
            df_PDOrm = df_PDOsplit.rolling(window=window, closed='right', min_periods=window).mean()
            df_PDOrm = df_PDOrm.rename({'PDO':f'PDO{yr}rm'}, axis=1)
            ax.plot_date(dates, df_PDOrm,
                         label=f'Rolling mean {yr}-year low-pass (closed right)', color='green',linestyle=ls[i],
                         linewidth=1, marker=None)
            list_dfPDO.append(df_PDObw) ; list_dfPDO.append(df_PDOrm)
            ax.legend()