예제 #1
0
        def MI_single_split(RV_ts, precur_train, s, alpha=.05, FDR_control=True):
            """
            Relate a precursor field to the target for one split, per lag.

            For every entry of ``lags`` the statistic function ``self.func`` is
            called with a lag-dependent selection of ``precur_train`` and the
            squeezed target values. ``self`` and ``lags`` are not parameters;
            they are taken from the enclosing scope.

            Parameters
            ----------
            RV_ts : object exposing ``.index`` and ``.values``
                Target time series.
            precur_train : object exposing ``.latitude``, ``.longitude``, ``.sel``
                Precursor field (presumably an xarray.DataArray -- TODO confirm).
            s : label
                Split label, used to select ``self.df_splits.loc[s]``.
            alpha : float
                Threshold on the (adjusted) p-values; a grid cell is unmasked
                only where its p-value is ``<= alpha``.
            FDR_control : bool
                If truthy, p-values are adjusted per lag with
                ``multicomp.multipletests(..., method='fdr_bh')`` before they
                are stored and thresholded.

            Returns
            -------
            Corr_Coeff : np.ma.MaskedArray
                Shape ``(len(lags), lat.size, lon.size)``; values returned by
                ``self.func``, masked where the p-value is not ``<= alpha``.
            pvals : np.ndarray
                Same shape; the (adjusted) p-values.

            Raises
            ------
            TypeError
                If a lag is neither an ``np.int16`` nor an ``np.ndarray``.
                Previously such a lag fell through all branches and either
                raised a NameError or silently reused the result of the
                previous lag.
            """
            lat = precur_train.latitude.values
            lon = precur_train.longitude.values
            n_cells = lat.size * lon.size
            n_lags = len(lags)

            z = np.zeros((n_cells, n_lags))
            # mask=z (all zeros) yields a full, all-False mask that can be
            # filled column by column below.
            Corr_Coeff = np.ma.array(z, mask=z)
            pvals = np.ones((n_cells, n_lags))

            dates_RV = RV_ts.index
            target = RV_ts.values.squeeze()  # identical for every lag
            for i, lag in enumerate(lags):
                if isinstance(lag, np.int16):
                    if self.lag_as_gap:
                        # if only shift tfreq, then gap=0
                        datesdaily = RV_class.aggr_to_daily_dates(
                            dates_RV, tfreq=self._tfreq)
                        dates_lag = functions_pp.func_dates_min_lag(
                            datesdaily, self._tfreq + lag)[1]
                        tmb = functions_pp.time_mean_bins
                        precur_lag = tmb(precur_train.sel(time=dates_lag),
                                         to_freq=self._tfreq)[0]
                    else:
                        # Lag expressed in time steps: take the training dates
                        # flagged by the shifted 'x_fit' mask of this split.
                        m = apply_shift_lag(self.df_splits.loc[s], lag)
                        dates_lag = m[np.logical_and(m['TrainIsTrue'] == 1,
                                                     m['x_fit'])].index
                        precur_lag = precur_train.sel(time=dates_lag)
                elif isinstance(lag, np.ndarray):
                    # The precursor already carries a 'lag' dimension; select
                    # along it with the loop index.
                    precur_lag = precur_train.sel(lag=i)
                else:
                    raise TypeError(
                        f'Unsupported lag {lag!r} of type {type(lag).__name__}; '
                        'expected np.int16 or np.ndarray.')

                corr_val, pval = self.func(precur_lag, target,
                                           **self.kwrgs_func)

                if FDR_control:
                    # test for field significance (Benjamini-Hochberg FDR)
                    p_lag = multicomp.multipletests(pval, method='fdr_bh')[1]
                else:
                    p_lag = pval
                pvals[:, i] = p_lag

                # Start fully masked; unmask only the significant cells.
                mask = np.ones(corr_val.size, dtype=bool)
                mask[p_lag <= alpha] = False

                Corr_Coeff[:, i] = corr_val[:]
                Corr_Coeff[:, i].mask = mask

            Corr_Coeff = np.ma.array(data=Corr_Coeff[:, :],
                                     mask=Corr_Coeff.mask[:, :])
            # (cell, lag) -> (lat, lon, lag) -> (lag, lat, lon)
            Corr_Coeff = Corr_Coeff.reshape(
                lat.size, lon.size, n_lags).swapaxes(2, 1).swapaxes(1, 0)
            pvals = pvals.reshape(
                lat.size, lon.size, n_lags).swapaxes(2, 1).swapaxes(1, 0)
            return Corr_Coeff, pvals
예제 #2
0
def get_lagged_ts(df_data, lag, keys=None):
    """
    Return the columns `keys` of `df_data` shifted by `lag`, plus the masks used.

    For every label in the first index level of `df_data`, the columns
    'TrainIsTrue' and 'RV_mask' are passed to `fc_utils.apply_shift_lag`.
    Rows flagged by the resulting 'x_fit' mask are selected and relabelled
    with the index of the rows flagged by 'y_fit'.

    Parameters
    ----------
    df_data : pd.DataFrame
        Frame with a MultiIndex whose first level enumerates the splits.
    lag : shift handed to `fc_utils.apply_shift_lag`; also used as column suffix.
    keys : iterable of column labels, optional
        Defaults to all columns whose dtype is not bool.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        The lagged columns, each renamed to '<key>_<lag>', and the
        concatenated lag masks of all splits.
    """
    if keys is None:
        keys = df_data.columns[df_data.dtypes != bool]

    splits = df_data.index.levels[0]
    mask_columns = ['TrainIsTrue', 'RV_mask']
    masks_per_split = [
        fc_utils.apply_shift_lag(df_data.loc[split][mask_columns], lag)
        for split in splits
    ]
    df_lagmask = pd.concat(masks_per_split, keys=splits)

    # Take the predictor rows and stamp them with the dates they predict.
    target_index = df_data[df_lagmask['y_fit']].index
    df_lag = df_data[df_lagmask['x_fit']]
    df_lag.index = target_index

    new_names = {k: k + f'_{lag}' for k in keys}
    lagged = df_lag[keys].rename(new_names, axis=1)
    return lagged, df_lagmask
예제 #3
0
#%%
# Reset every matplotlib rc setting to the library defaults.
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)
#%% get Correlation between pattern and PDO

# Switch the first MI precursor to the 'pattern cov' time-series method and
# re-run clustering and time-series extraction on `rg`.
# NOTE(review): presumably this refreshes rg.df_data used below -- confirm.
rg.list_for_MI[0].calc_ts = 'pattern cov'
rg.cluster_list_MI()
rg.get_ts_prec()

# NOTE(review): presumably keeps only the test rows of each split -- confirm
# against functions_pp.get_df_test.
df_test = functions_pp.get_df_test(rg.df_data)

# Join on the index (merge defaults to an inner join) and keep only the PDO
# column and the SST pattern column.
df_PDO_and_SST = df_PDOs.merge(df_test, left_index=True, right_index=True)[['PDO', '1..0..sst_sp']]


# 'x_pred' mask for a shift of 1, taken from split 0. It is applied through
# `.values`, i.e. positionally, so its length must equal the number of rows of
# df_PDO_and_SST.
RV_mask = fc_utils.apply_shift_lag(rg.df_splits, 1)['x_pred'].loc[0]
df_PDO_and_SST = df_PDO_and_SST[RV_mask.values]
# Pairwise correlation matrix of the two columns (cell output).
df_PDO_and_SST.corr()

# Disabled experiment, kept for reference:
# rg.cluster_list_MI() ; rg.get_ts_prec() ;
# out = rg.fit_df_data_ridge()

# s = 0
# X_pred = out[2]['lag_1'][f'split_{s}'].X_pred
# X_pred.index = df_prec.loc[s].index
# df = X_pred.merge(df_prec.loc[s], left_index=True, right_index=True)
# df = df.merge(PDO1.loc[s], left_index=True, right_index=True)
# df = rg.TV.RV_ts.merge(df, left_index=True, right_index=True)


예제 #4
0
def prepare_data(y_ts,
                 df_split,
                 lag_i=int,
                 dates_tobin=None,
                 precur_aggr=None,
                 normalize='datesRV',
                 remove_RV=True,
                 keys=None,
                 add_autocorr=True,
                 EOF=False,
                 expl_var=None):
    '''
    Build the precursor dataframe and fit masks of one split for model fitting.

    Parameters
    ----------
    y_ts : mapping
        Only ``y_ts['cont']`` is used: the number of days between the first
        two entries of its index is taken as the time step of the target.
    df_split : pd.DataFrame
        Data of a single split. The first column is treated as the RV
        (target). Must contain the columns
        'TrainIsTrue' (specifies train and test dates) and
        'RV_mask' (specifies what data will be predicted).
    lag_i : int
        The masks for fitting and predicting are shifted by {lag_i} periods.
        NOTE: the default is the ``int`` type itself (most likely a misplaced
        annotation), so callers have to pass a value explicitly.
    dates_tobin : None, pd.DatetimeIndex or tuple
        None             -> masks are shifted in steps of the data's own
                            time step via util.apply_shift_lag.
        pd.DatetimeIndex -> df_split holds daily data; the dates are shifted
                            by lag_i (+ half the target time step if
                            lag_i != 0) and aggregated with _daily_to_aggr.
        tuple            -> passed as start_end_date to start_end_date_mean.
    precur_aggr : passed to _daily_to_aggr; also changes how the
        autocorrelation column is shifted when it is not None.
    normalize : 'all', 'datesRV' / True, 'x_fit' or False
        Rows used to compute the mean and std for standardizing the
        x_keys columns: all training rows, training rows flagged 'y_pred',
        training rows flagged 'x_fit', or no normalization. Any other value
        leaves the data untouched.
    remove_RV : bool
        If True (identity check), the RV column is removed from the
        precursors.
    keys : iterable of column labels, optional
        Columns to use; defaults to all non-bool columns of df_split. Keys
        not present in df_split are dropped.
    add_autocorr : bool
        Insert a (shifted) copy of the RV as first column 'autocorr'.
    EOF : bool
        If truthy, df_prec is replaced by the output of transform_EOF.
    expl_var : float, optional
        Passed to transform_EOF; defaults to 0.75.

    returns:
        df_prec     : Dataframe with the precursor columns merged with the
                      fit masks.
        upd_keys    : updated set of keys to fit model

    Raises
    ------
    TypeError
        If dates_tobin is not None, a pd.DatetimeIndex or a tuple.
        (Previously this surfaced later as an UnboundLocalError.)
    '''
    #%%
    # =============================================================================
    # Select features / variables
    # =============================================================================
    if keys is None:
        keys = np.array(df_split.dtypes.index[df_split.dtypes != bool],
                        dtype='object')

    RV_name = df_split.columns[0]
    df_RV = df_split[RV_name]
    if remove_RV is True:
        # completely remove RV timeseries
        df_prec = df_split.drop([RV_name], axis=1).copy()
        keys = np.array([k for k in keys if k != RV_name], dtype='object')
    else:
        keys = np.array(keys, dtype='object')
        df_prec = df_split.copy()
    # not all keys are present in each split:
    available = set(df_split.columns)
    keys = [k for k in keys if k in available]
    x_keys = np.array(keys, dtype='object')

    dates_TV = y_ts['cont'].index
    tfreq_TV = (dates_TV[1] - dates_TV[0]).days

    if add_autocorr:

        if lag_i == 0 and precur_aggr is None:
            # minimal shift of lag 1 or it will follow shift with x_fit mask
            RV_ac = df_RV.shift(periods=-1).copy()
        elif precur_aggr is not None and lag_i < int(tfreq_TV / 2):
            # df_RV is daily and should be shifted more than tfreq/2,
            # otherwise we would just be predicting with (part of) the
            # observed ts. Because dates_min_lag is selected below, adding an
            # RV that is also shifted min_lag days would select the actual
            # observed ts.
            # NOTE(review): a negative `periods` moves values towards earlier
            # index positions -- confirm this is the intended direction.
            shift = tfreq_TV - lag_i
            RV_ac = df_RV.shift(periods=-shift).copy()
        else:
            # RV will be shifted according to fit_masks, lag will be > 1
            RV_ac = df_RV.copy()

        # plugging in the mean value where shifting left no data
        RV_ac.loc[RV_ac.isna()] = RV_ac.mean()

        df_prec.insert(0, 'autocorr', RV_ac)
        # add key to keys
        if 'autocorr' not in keys:
            x_keys = np.array(np.insert(x_keys, 0, 'autocorr'), dtype='object')

    # =============================================================================
    # Shifting data w.r.t. index dates
    # =============================================================================
    if dates_tobin is None:
        # we can only make lag steps of size tfreq, e.g. if df_data contains
        # 10 day means, we can make a lag_step of 1, 2, etc, resulting in lag
        # in days of 5, 15, 25.
        fit_masks = df_split.loc[:, ['TrainIsTrue', 'RV_mask']].copy()
        fit_masks = util.apply_shift_lag(fit_masks, lag_i)
    elif isinstance(dates_tobin, pd.DatetimeIndex):
        # df_data contain daily data, we can shift dates_tobin to allow any
        # lag in days w.r.t. target variable
        # minimal shift to get lag vs onset
        base_lag = 0 if lag_i == 0 else int(tfreq_TV / 2)
        last_centerdate = dates_TV[-1]
        fit_masks = df_split.loc[:, ['TrainIsTrue', 'RV_mask']].copy()
        fit_masks = util.apply_shift_lag(fit_masks, lag_i + base_lag)
        df_prec = df_prec[x_keys].merge(fit_masks,
                                        left_index=True,
                                        right_index=True)
        dates_bin_shift = functions_pp.func_dates_min_lag(
            dates_tobin, lag_i + base_lag)[1]

        df_prec, _ = _daily_to_aggr(df_prec.loc[dates_bin_shift],
                                    precur_aggr)
        # the boolean columns of the aggregated frame are the fit masks
        fit_masks = df_prec[df_prec.columns[df_prec.dtypes == bool]]
        # check y_fit mask
        fit_masks = util._check_y_fitmask(fit_masks, lag_i, base_lag)
        lag_v = (last_centerdate - df_prec[df_prec['x_fit']].index[-1]).days
        if tfreq_TV == precur_aggr:
            assert lag_v == lag_i + base_lag, (
                f'lag center precur vs center TV is {lag_v} days, with '
                f'lag_i {lag_i} and base_lag {base_lag}')
    elif isinstance(dates_tobin, tuple):
        df_prec = start_end_date_mean(df_prec, start_end_date=dates_tobin)
        fit_masks = util.apply_shift_lag(df_prec[['TrainIsTrue', 'RV_mask']],
                                         0)
    else:
        raise TypeError(
            'dates_tobin must be None, a pd.DatetimeIndex or a tuple, got '
            f'{type(dates_tobin).__name__}')

    # Explicit copy: the columns are assigned to below, which must not act on
    # a slice of another frame.
    df_prec = df_prec[x_keys].copy()
    # =============================================================================
    # Normalize data using datesRV or all training data in dataframe
    # =============================================================================
    if normalize == 'all':
        # Normalize using all training dates
        norm_rows = fit_masks['TrainIsTrue']
    elif normalize in ('datesRV', True):
        # Normalize only using the RV dates
        norm_rows = np.logical_and(fit_masks['TrainIsTrue'],
                                   fit_masks['y_pred']).values
    elif normalize == 'x_fit':
        # Normalize only using the training dates used as predictor input
        norm_rows = np.logical_and(fit_masks['TrainIsTrue'],
                                   fit_masks['x_fit']).values
    else:
        # False (or any unrecognised value): leave the data as is
        norm_rows = None

    if norm_rows is not None:
        reference = df_prec[x_keys][norm_rows]
        df_prec[x_keys] = (df_prec[x_keys] - reference.mean(0)) \
                / reference.std(0)

    if EOF:
        if expl_var is None:
            expl_var = 0.75
        df_prec = transform_EOF(df_prec,
                                fit_masks['TrainIsTrue'],
                                fit_masks['x_fit'],
                                expl_var=expl_var)
        df_prec.columns = df_prec.columns.astype(str)
        upd_keys = np.array(df_prec.columns.values.ravel(), dtype=str)
    else:
        upd_keys = x_keys

    # =============================================================================
    # Replace masks
    # =============================================================================
    df_prec = df_prec.merge(fit_masks, left_index=True, right_index=True)
    #%%
    return df_prec, upd_keys
예제 #5
0
    elif method == 'PC-like':
        df_links_s = df_pvals_fs.loc[s] <= alpha_CI
        df_str_s = df_str_fs.loc[s]

    ts_list = []
    df_MCIc_s = df_str_s
    df_data_s = rg.df_data.loc[s].copy()
    fit_masks = rg.df_data.loc[s][['TrainIsTrue', 'RV_mask']].copy()
    newfitmask = fit_masks[['TrainIsTrue', 'RV_mask']][fit_masks['RV_mask']]
    for i, k in enumerate(df_links_s.index):
        lags = df_links_s.loc[k][df_links_s.loc[k]].index
        if strongest_lag and len(lags) > 1:
            strngth = df_MCIc_s.loc[k][[f'coeff l{l}' for l in lags]].abs()
            lags = [int(strngth.idxmax()[-1])]
        for l in lags:
            m = fc_utils.apply_shift_lag(fit_masks, l)['x_fit']
            ts = df_data_s[[k]][m]
            ts.columns = [k.replace(k.split('..')[0], str(l))]
            ts.index = newfitmask.index
            ts_list.append(ts)
    df_s = pd.concat(ts_list, axis=1)
    df_s = df_s.merge(newfitmask, left_index=True, right_index=True)
    df_causal[s] = df_s
df_causal = pd.concat(df_causal, keys=range(n_spl))

# target
# Rows of split 0 selected by the last column of rg.df_data (used as a row
# mask), keeping only the first column as a one-column DataFrame.
fc_mask = rg.df_data.iloc[:, -1].loc[0]  #.shift(lag, fill_value=False)
target_ts = rg.df_data.iloc[:, [0]].loc[0][fc_mask]
# standardize the target to zero mean and unit standard deviation
target_ts = (target_ts - target_ts.mean()) / target_ts.std()
# metrics
# target_ts is a one-column DataFrame, so .mean() is a one-element Series;
# take the scalar explicitly, since float() on a Series is deprecated in pandas.
RMSE_SS = fc_utils.ErrorSkillScore(
    constant_bench=float(target_ts.mean().iloc[0])).RMSE