def MI_single_split(RV_ts, precur_train, s, alpha=.05, FDR_control=True):
    """Correlate a precursor field with the target series for every lag.

    This function reads ``self`` and ``lags`` from the enclosing scope; it
    is not usable as a free-standing function.

    Parameters
    ----------
    RV_ts : object exposing ``.index`` and ``.values``
        Target time series; ``RV_ts.values.squeeze()`` is handed to
        ``self.func`` as the second argument.
    precur_train : object exposing ``.latitude``, ``.longitude``, ``.sel``
        Precursor field (presumably an xarray.DataArray -- confirm against
        the caller).
    s : hashable
        Key used to select one split via ``self.df_splits.loc[s]``.
    alpha : float, optional
        Significance level. Grid cells with a (possibly adjusted) p-value
        above ``alpha`` are masked.
    FDR_control : bool, optional
        When truthy, p-values are adjusted with
        ``multicomp.multipletests(..., method='fdr_bh')`` before being
        compared with ``alpha``.

    Returns
    -------
    Corr_Coeff : numpy.ma.MaskedArray
        Shape ``(len(lags), lat.size, lon.size)``; masked where the p-value
        exceeds ``alpha``.
    pvals : numpy.ndarray
        Same shape as ``Corr_Coeff``; adjusted p-values when ``FDR_control``
        is truthy, raw p-values otherwise.

    Raises
    ------
    TypeError
        If an element of ``lags`` is neither an integer nor a numpy array.
        Previously such a lag silently reused the result of the preceding
        iteration (or raised ``UnboundLocalError`` on the first one).
    """
    lat = precur_train.latitude.values
    lon = precur_train.longitude.values
    n_gridcells = lat.size * lon.size
    n_lags = len(lags)

    z = np.zeros((n_gridcells, n_lags))
    Corr_Coeff = np.ma.array(z, mask=z)
    pvals = np.ones((n_gridcells, n_lags))

    dates_RV = RV_ts.index
    # Loop-invariant: the target values passed to self.func for every lag.
    target = RV_ts.values.squeeze()

    for i, lag in enumerate(lags):
        if isinstance(lag, (int, np.integer)) and not self.lag_as_gap:
            # Select precursor dates through the shifted 'x_fit' mask,
            # restricted to the training part of split s.
            m = apply_shift_lag(self.df_splits.loc[s], lag)
            in_train_fit = np.logical_and(m['TrainIsTrue'] == 1, m['x_fit'])
            dates_lag = m[in_train_fit].index
            corr_val, pval = self.func(precur_train.sel(time=dates_lag),
                                       target,
                                       **self.kwrgs_func)
        elif isinstance(lag, (int, np.integer)):
            # lag_as_gap: if only shift tfreq, then gap=0
            datesdaily = RV_class.aggr_to_daily_dates(dates_RV,
                                                      tfreq=self._tfreq)
            dates_lag = functions_pp.func_dates_min_lag(
                datesdaily, self._tfreq + lag)[1]
            tmb = functions_pp.time_mean_bins
            binned = tmb(precur_train.sel(time=dates_lag),
                         to_freq=self._tfreq)[0]
            corr_val, pval = self.func(binned, target, **self.kwrgs_func)
        elif isinstance(lag, np.ndarray):
            # Array-valued lags: the field carries its own 'lag' coordinate,
            # selected by position in `lags`.
            corr_val, pval = self.func(precur_train.sel(lag=i),
                                       target,
                                       **self.kwrgs_func)
        else:
            raise TypeError(
                f'Unsupported lag type {type(lag).__name__!r} at position '
                f'{i}; expected an integer or a numpy array.')

        # Start fully masked; unmask only the significant grid cells.
        mask = np.ones(corr_val.size, dtype=bool)
        if FDR_control:
            # test for Field significance and mask unsignificant values
            # FDR control:
            adjusted_pvalues = multicomp.multipletests(pval, method='fdr_bh')
            ad_p = adjusted_pvalues[1]
            pvals[:, i] = ad_p
            mask[ad_p <= alpha] = False
        else:
            pvals[:, i] = pval
            mask[pval <= alpha] = False

        Corr_Coeff[:, i] = corr_val[:]
        Corr_Coeff[:, i].mask = mask

    Corr_Coeff = np.ma.array(data=Corr_Coeff[:, :],
                             mask=Corr_Coeff.mask[:, :])
    # (space, lag) -> (lat, lon, lag) -> (lag, lat, lon)
    Corr_Coeff = Corr_Coeff.reshape(
        lat.size, lon.size, n_lags).swapaxes(2, 1).swapaxes(1, 0)
    pvals = pvals.reshape(
        lat.size, lon.size, n_lags).swapaxes(2, 1).swapaxes(1, 0)
    return Corr_Coeff, pvals
def get_lagged_ts(df_data, lag, keys=None):
    """Return the columns of ``df_data`` shifted by ``lag``, plus the masks.

    For every first-level index value of ``df_data`` the 'TrainIsTrue' and
    'RV_mask' columns are passed to ``fc_utils.apply_shift_lag``. Rows
    flagged by the resulting 'x_fit' mask are selected and re-labelled with
    the index of the rows flagged by 'y_fit'.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame with a MultiIndex whose first level enumerates the splits.
    lag : int
        Lag forwarded to ``fc_utils.apply_shift_lag``; also appended to the
        returned column names as ``'_{lag}'``.
    keys : sequence, optional
        Columns to return. Defaults to every non-boolean column.

    Returns
    -------
    tuple
        ``(lagged_frame, df_lagmask)`` where ``lagged_frame`` holds the
        selected columns renamed to ``<key>_<lag>``.
    """
    if keys is None:
        keys = df_data.columns[df_data.dtypes != bool]

    splits = df_data.index.levels[0]
    mask_columns = ['TrainIsTrue', 'RV_mask']
    masks_per_split = [
        fc_utils.apply_shift_lag(df_data.loc[split][mask_columns], lag)
        for split in splits
    ]
    df_lagmask = pd.concat(masks_per_split, keys=splits)

    # Take the predictor rows, then label them with the target-row dates.
    df_lag = df_data[df_lagmask['x_fit']]
    df_lag.index = df_data[df_lagmask['y_fit']].index

    selected = df_lag[keys]
    lagged_names = {k: k + f'_{lag}' for k in keys}
    return selected.rename(lagged_names, axis=1), df_lagmask
#%%
# Restore matplotlib's default rc settings.
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

#%% get Correlation between pattern and PDO
# Recompute the precursor time series using the 'pattern cov' setting.
rg.list_for_MI[0].calc_ts = 'pattern cov'
rg.cluster_list_MI()
rg.get_ts_prec()

# Join the PDO frame with the test-data frame on their indices and keep
# only the two columns that are compared.
df_test = functions_pp.get_df_test(rg.df_data)
df_PDO_and_SST = df_PDOs.merge(df_test, left_index=True, right_index=True)
df_PDO_and_SST = df_PDO_and_SST[['PDO', '1..0..sst_sp']]

# Keep only the rows flagged by the 'x_pred' mask (lag 1) of split 0.
RV_mask = fc_utils.apply_shift_lag(rg.df_splits, 1)['x_pred'].loc[0]
df_PDO_and_SST = df_PDO_and_SST[RV_mask.values]
df_PDO_and_SST.corr()

# Disabled scratch code, kept for reference:
# rg.cluster_list_MI() ; rg.get_ts_prec() ;
# out = rg.fit_df_data_ridge()
# s = 0
# X_pred = out[2]['lag_1'][f'split_{s}'].X_pred
# X_pred.index = df_prec.loc[s].index
# df = X_pred.merge(df_prec.loc[s], left_index=True, right_index=True)
# df = df.merge(PDO1.loc[s], left_index=True, right_index=True)
# df = rg.TV.RV_ts.merge(df, left_index=True, right_index=True)
def prepare_data(y_ts, df_split, lag_i=int, dates_tobin=None,
                 precur_aggr=None, normalize='datesRV', remove_RV=True,
                 keys=None, add_autocorr=True, EOF=False,
                 expl_var=None):
    '''
    Build the (optionally lagged, normalized) predictor frame for one split.

    Parameters
    ----------
    y_ts : mapping
        Only ``y_ts['cont'].index`` is read: it supplies the target dates
        and, from the first two dates, the target time step in days.
    df_split : pandas.DataFrame
        First column is the RV (target). Must contain the columns
        'TrainIsTrue' (train/test dates) and 'RV_mask' (dates to predict).
    lag_i : int
        Mask for fitting and predicting will be shifted with {lag_i}
        periods.
        NOTE(review): the default is the builtin type ``int`` rather than a
        number, which looks like a typo. It is kept for interface
        compatibility -- callers should always pass a value.
    dates_tobin : None, pandas.DatetimeIndex or tuple, optional
        None: shift the masks by ``lag_i`` steps of the frame's own time
        step. DatetimeIndex: df_split holds daily data which is shifted by
        ``lag_i + base_lag`` and aggregated with ``precur_aggr``. tuple:
        forwarded as ``start_end_date`` to ``start_end_date_mean``.
    precur_aggr : int, optional
        Aggregation passed to ``_daily_to_aggr``; also determines how the
        autocorrelation column is shifted.
    normalize : {'datesRV', 'all', 'x_fit', True, False}, optional
        Which training rows provide mean/std for standardization:
        'all' -> TrainIsTrue; 'datesRV' or True -> TrainIsTrue & y_pred;
        'x_fit' -> TrainIsTrue & x_fit. Any other value leaves the data
        untouched.
    remove_RV : bool, optional
        First column is the RV, and is removed when this is True.
    keys : sequence, optional
        Columns to use as predictors. Defaults to all non-boolean columns.
        Keys that are absent from df_split are dropped.
    add_autocorr : bool, optional
        Insert a (shifted) copy of the RV as first column 'autocorr'.
    EOF : bool, optional
        Replace the predictors by the output of ``transform_EOF``.
    expl_var : float, optional
        Explained variance passed to ``transform_EOF``; 0.75 when None.

    Returns
    -------
    df_prec : pandas.DataFrame
        Predictor columns merged (on index) with the fit masks.
    upd_keys : numpy.ndarray
        Updated set of keys to fit the model.

    Raises
    ------
    TypeError
        If ``dates_tobin`` is not None, a DatetimeIndex or a tuple.
    '''
    # =========================================================================
    # Select features / variables
    # =========================================================================
    if keys is None:
        keys = np.array(df_split.dtypes.index[df_split.dtypes != bool],
                        dtype='object')

    RV_name = df_split.columns[0]
    df_RV = df_split[RV_name]
    if remove_RV is True:
        # completely remove RV timeseries
        df_prec = df_split.drop([RV_name], axis=1).copy()
        keys = [k for k in keys if k != RV_name]
    else:
        df_prec = df_split.copy()

    # not all keys are present in each split:
    available = list(df_split.columns)
    keys = [k for k in keys if k in available]
    x_keys = np.array(keys, dtype='object')

    dates_TV = y_ts['cont'].index
    tfreq_TV = (dates_TV[1] - dates_TV[0]).days

    if add_autocorr:
        if lag_i == 0 and precur_aggr is None:
            # minimal shift of lag 1 or it will follow shift with x_fit mask
            RV_ac = df_RV.shift(periods=-1).copy()
        elif precur_aggr is not None and lag_i < int(tfreq_TV / 2):
            # df_RV is daily and should be shifted more tfreq/2 otherwise
            # just predicting with the (part) of the observed ts.
            # I am selecting dates_min_lag, thus adding RV that is also
            # shifted min_lag days will result in that I am selecting the
            # actual observed ts.
            # lag < tfreq_TV
            shift = tfreq_TV - lag_i
            RV_ac = df_RV.shift(periods=-shift).copy()
            # for lag_i = 0, tfreq_TV=10
            # 1979-06-15    7.549415 is now:
            # 1979-06-20    7.549415
            # when selecting value of 06-15, I am actually selecting val of
            # 6-10; minimal shift of 10 days backward in time is realized
        else:
            # RV will shifted according fit_masks, lag will be > 1
            RV_ac = df_RV.copy()

        # plugging in the mean value for the last date if no data
        # is available to shift backward
        RV_ac.loc[RV_ac.isna()] = RV_ac.mean()

        df_prec.insert(0, 'autocorr', RV_ac)
        # add key to keys
        if 'autocorr' not in keys:
            x_keys = np.array(np.insert(x_keys, 0, 'autocorr'),
                              dtype='object')

    # =========================================================================
    # Shifting data w.r.t. index dates
    # =========================================================================
    if dates_tobin is None:
        # we can only make lag steps of size tfreq, e.g. if df_data contains
        # 10 day means, we can make a lag_step of 1, 2, etc, resulting in lag
        # in days of 5, 15, 25.
        fit_masks = df_split.loc[:, ['TrainIsTrue', 'RV_mask']].copy()
        fit_masks = util.apply_shift_lag(fit_masks, lag_i)
    elif isinstance(dates_tobin, pd.DatetimeIndex):
        # df_data contain daily data, we can shift dates_tobin to allow any
        # lag in days w.r.t. target variable
        # minimal shift to get lag vs onset
        base_lag = 0 if lag_i == 0 else int(tfreq_TV / 2)
        total_lag = lag_i + base_lag

        last_centerdate = dates_TV[-1]
        fit_masks = df_split.loc[:, ['TrainIsTrue', 'RV_mask']].copy()
        fit_masks = util.apply_shift_lag(fit_masks, total_lag)
        df_prec = df_prec[x_keys].merge(fit_masks,
                                        left_index=True, right_index=True)
        dates_bin_shift = functions_pp.func_dates_min_lag(
            dates_tobin, total_lag)[1]

        df_prec, dates_tobin_p = _daily_to_aggr(
            df_prec.loc[dates_bin_shift], precur_aggr)
        # After aggregation the masks are the boolean columns of df_prec.
        fit_masks = df_prec[df_prec.columns[df_prec.dtypes == bool]]
        # check y_fit mask
        fit_masks = util._check_y_fitmask(fit_masks, lag_i, base_lag)
        lag_v = (last_centerdate - df_prec[df_prec['x_fit']].index[-1]).days
        if tfreq_TV == precur_aggr:
            assert lag_v == lag_i + base_lag, (
                f'lag center precur vs center TV is {lag_v} days, with '
                f'lag_i {lag_i} and base_lag {base_lag}')
    elif isinstance(dates_tobin, tuple):
        df_prec = start_end_date_mean(df_prec, start_end_date=dates_tobin)
        fit_masks = util.apply_shift_lag(
            df_prec[['TrainIsTrue', 'RV_mask']], 0)
    else:
        # Without this guard fit_masks would stay unbound and the function
        # would fail further down with an unrelated-looking error.
        raise TypeError(
            'dates_tobin must be None, a pandas.DatetimeIndex or a tuple, '
            f'got {type(dates_tobin).__name__!r}')

    # Keep predictors only; copy so the in-place normalization below writes
    # to an independent frame rather than to a slice of another one.
    df_prec = df_prec[x_keys].copy()

    # =========================================================================
    # Normalize data using datesRV or all training data in dataframe
    # =========================================================================
    # `==` (not `is`) is kept on purpose so that 1/0 behave like True/False,
    # exactly as before.
    if normalize == 'all':
        # Normalize using all training dates
        norm_rows = fit_masks['TrainIsTrue']
    elif normalize == 'datesRV' or normalize == True:  # noqa: E712
        # Normalize only using the RV dates
        norm_rows = np.logical_and(fit_masks['TrainIsTrue'],
                                   fit_masks['y_pred']).values
    elif normalize == 'x_fit':
        # Normalize only using the x_fit dates
        norm_rows = np.logical_and(fit_masks['TrainIsTrue'],
                                   fit_masks['x_fit']).values
    else:
        # normalize == False (or an unrecognized value): leave data as is
        norm_rows = None

    if norm_rows is not None:
        reference = df_prec[x_keys][norm_rows]
        df_prec[x_keys] = (df_prec[x_keys] - reference.mean(0)) \
            / reference.std(0)

    if EOF:
        if expl_var is None:
            expl_var = 0.75
        df_prec = transform_EOF(df_prec, fit_masks['TrainIsTrue'],
                                fit_masks['x_fit'], expl_var=expl_var)
        df_prec.columns = df_prec.columns.astype(str)
        upd_keys = np.array(df_prec.columns.values.ravel(), dtype=str)
    else:
        upd_keys = x_keys

    # =========================================================================
    # Replace masks
    # =========================================================================
    df_prec = df_prec.merge(fit_masks, left_index=True, right_index=True)
    return df_prec, upd_keys
elif method == 'PC-like': df_links_s = df_pvals_fs.loc[s] <= alpha_CI df_str_s = df_str_fs.loc[s] ts_list = [] df_MCIc_s = df_str_s df_data_s = rg.df_data.loc[s].copy() fit_masks = rg.df_data.loc[s][['TrainIsTrue', 'RV_mask']].copy() newfitmask = fit_masks[['TrainIsTrue', 'RV_mask']][fit_masks['RV_mask']] for i, k in enumerate(df_links_s.index): lags = df_links_s.loc[k][df_links_s.loc[k]].index if strongest_lag and len(lags) > 1: strngth = df_MCIc_s.loc[k][[f'coeff l{l}' for l in lags]].abs() lags = [int(strngth.idxmax()[-1])] for l in lags: m = fc_utils.apply_shift_lag(fit_masks, l)['x_fit'] ts = df_data_s[[k]][m] ts.columns = [k.replace(k.split('..')[0], str(l))] ts.index = newfitmask.index ts_list.append(ts) df_s = pd.concat(ts_list, axis=1) df_s = df_s.merge(newfitmask, left_index=True, right_index=True) df_causal[s] = df_s df_causal = pd.concat(df_causal, keys=range(n_spl)) # target fc_mask = rg.df_data.iloc[:, -1].loc[0] #.shift(lag, fill_value=False) target_ts = rg.df_data.iloc[:, [0]].loc[0][fc_mask] target_ts = (target_ts - target_ts.mean()) / target_ts.std() # metrics RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=float(target_ts.mean())).RMSE