def CPPA_precursor_regions(path_data, keys_options=None):
    #%%
    '''
    Build, per keys-option and per train/test split, the list of precursor
    column names to use from the dataframe stored at ``path_data``.

    Parameters
    ----------
    path_data : str
        Path to an hdf5 file containing 'df_data' (multi-index: split, time).
    keys_options : list of str, optional
        Any of 'robust', 'CPPA', 'PEP'. Defaults to ['CPPA'].
        (Changed from a mutable default argument to a None sentinel.)

    Returns
    -------
    keys_d : dict
        keys_d[option][split] = np.array of unique column names.
    '''
    if keys_options is None:
        keys_options = ['CPPA']
    dict_of_dfs = func_fc.load_hdf5(path_data)
    df_data = dict_of_dfs['df_data']
    splits = df_data.index.levels[0]
    # bookkeeping columns that are never precursors
    skip = ['TrainIsTrue', 'RV_mask']
    keys_d = {}
    for option in keys_options:
        keys_d_ = {}
        for s in splits:
            if option == 'robust':
                # fix: removed duplicated entries ('ENSO_34' and 'PDO' were
                # each listed twice in the original)
                not_robust = ['0_101_PEPspatcov', 'PDO', 'ENSO_34']
                all_keys = df_data.loc[s].columns[1:]
                all_keys = [k for k in all_keys if k not in skip]
                all_keys = [k for k in all_keys if k not in not_robust]
                # region columns look like '{lag}_{region}_{name}'; keep only
                # the regions judged robust plus all non-region columns.
                # NOTE(review): '0_100_CPPAspatcov' can never equal
                # k.split('_')[1]; if the spatcov column is meant to be kept,
                # '100' is likely the intended entry — confirm.
                robust = ['0_100_CPPAspatcov', '2', '7', '9']
                sst_regs = [k for k in all_keys if len(k.split('_')) == 3]
                other = [k for k in all_keys if len(k.split('_')) != 3]
                keys_ = [k for k in sst_regs if k.split('_')[1] in robust]
                # fix: use extend() instead of a side-effecting list comprehension
                keys_.extend(other)
            elif option == 'CPPA':
                not_robust = ['0_101_PEPspatcov', '0_104_PDO', '0_103_ENSO34',
                              'ENSO_34', 'PDO', '0_900_ENSO34', '0_901_PDO']
                all_keys = df_data.loc[s].columns[1:]
                all_keys = [k for k in all_keys if k not in skip]
                all_keys = [k for k in all_keys if k not in not_robust]
                keys_ = all_keys
            elif option == 'PEP':
                all_keys = df_data.loc[s].columns[1:]
                all_keys = [k for k in all_keys if k not in skip]
                keys_ = [k for k in all_keys
                         if k.split('_')[-1] == 'PEPspatcov']
            keys_d_[s] = np.unique(keys_)
        keys_d[option] = keys_d_
    #%%
    return keys_d
def import_precur_ts(import_prec_ts, df_splits, to_freq, start_end_date, start_end_year):
    '''
    Load externally computed precursor timeseries and align them with the
    train/test splits of the current experiment.

    import_prec_ts has format tuple (name, path_data).

    Returns
    -------
    df_data_ext : pd.DataFrame
        Multi-index (split, time) dataframe with all external timeseries
        merged column-wise.
    '''
    splits = df_splits.index.levels[0]
    df_data_ext_s = np.zeros((splits.size), dtype=object)
    counter = 0
    for name, path_data in import_prec_ts:
        for s in range(splits.size):
            # skip first col because it is the RV ts
            df_data_e = func_fc.load_hdf5(path_data)['df_data'].iloc[:, 1:].loc[s]
            cols_ts = np.logical_or(df_data_e.dtypes == 'float64',
                                    df_data_e.dtypes == 'float32')
            cols_ext = list(df_data_e.columns[cols_ts])
            # cols_ext must be of format '{}_{int}_{}'; rewrite columns whose
            # middle token is not an integer label, numbering from 100 up
            lab_int = 100
            # fix: inner loop variable renamed (previously shadowed the outer
            # enumerate index)
            for j, c in enumerate(cols_ext):
                char = c.split('_')[1]
                if not char.isdigit():
                    cols_ext[j] = c.replace(char, str(lab_int)) + char
                    lab_int += 1
            df_data_ext_s[s] = df_data_e[cols_ext]
            # infer native frequency from the first two timestamps
            tfreq_date_e = (df_data_e.index[1] - df_data_e.index[0]).days
            if to_freq != tfreq_date_e:
                try:
                    df_data_ext_s[s] = functions_pp.time_mean_bins(df_data_ext_s[s],
                                                                   to_freq,
                                                                   start_end_date,
                                                                   start_end_year)[0]
                except KeyError as e:
                    # fix: added missing space between 'not' and 'found'
                    print('KeyError captured, likely the requested dates '
                          'given by start_end_date and start_end_year are not '
                          'found in external pandas timeseries.\n{}'.format(str(e)))
        df_new = pd.concat(list(df_data_ext_s), keys=range(splits.size))
        if counter == 0:
            df_data_ext = df_new
        else:
            # fix: the original merged df_data_ext with itself and discarded
            # the result; merge the newly loaded dataset and keep the result
            df_data_ext = df_data_ext.merge(df_new, left_index=True,
                                            right_index=True)
        # fix: counter was never incremented, so only the last dataset survived
        counter += 1
    return df_data_ext
def RV_and_traintest(fullts, TV_ts, method=str, kwrgs_events=None, precursor_ts=None, seed=int, verbosity=1):
    '''
    Build the target-variable class and a random train/test split.

    ``fullts`` / ``TV_ts`` are timeseries objects with a ``.time`` coordinate
    (full period and target period). When ``precursor_ts`` is given, assert
    that the newly drawn split equals the split stored with that dataset.

    Returns (TV, df_splits).
    '''
    # Define traintest:
    idx_full = pd.to_datetime(fullts.time.values)
    idx_RV = pd.to_datetime(TV_ts.time.values)
    df_RVfullts = pd.DataFrame(fullts.values, index=idx_full)
    df_RV_ts = pd.DataFrame(TV_ts.values, index=idx_RV)

    # stratified methods need the event definition when building the RV class
    if method[:9] == 'ran_strat':
        TV = classes.RV_class(df_RVfullts, df_RV_ts, kwrgs_events)
    else:
        TV = classes.RV_class(df_RVfullts, df_RV_ts)

    if precursor_ts is None:
        df_splits = functions_pp.rand_traintest_years(TV, method=method,
                                                      seed=seed,
                                                      kwrgs_events=kwrgs_events,
                                                      verb=verbosity)
    else:
        # Retrieve same train test split as imported ts
        path_data = ''.join(precursor_ts[0][1])
        df_splits_imp = func_fc.load_hdf5(
            path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs_imp = functions_pp.get_testyrs(df_splits_imp)
        df_splits = functions_pp.rand_traintest_years(TV, method=method,
                                                      seed=seed,
                                                      kwrgs_events=kwrgs_events,
                                                      verb=verbosity)
        test_yrs_set = functions_pp.get_testyrs(df_splits)
        assert (np.equal(test_yrs_imp, test_yrs_set)).all(), "Train test split not equal"
    return TV, df_splits
def RV_and_traintest(fullts, TV_ts, method=str, kwrgs_events=None, precursor_ts=None, seed=int, verbosity=1):
    '''
    Build the target-variable class and a random train/test split, creating a
    default event definition when a stratified split is requested without one.

    Returns (TV, df_splits).
    '''
    # Define traintest:
    df_fullts = pd.DataFrame(fullts.values,
                             index=pd.to_datetime(fullts.time.values))
    df_RV_ts = pd.DataFrame(TV_ts.values,
                            index=pd.to_datetime(TV_ts.time.values))

    needs_default_events = method[:9] == 'ran_strat' and kwrgs_events is None
    if needs_default_events:
        # events need to be defined to enable stratified traintest.
        kwrgs_events = {'event_percentile': 66,
                        'min_dur': 1,
                        'max_break': 0,
                        'grouped': False}
        if verbosity == 1:
            print("kwrgs_events not given, creating stratified traintest split "
                  "based on events defined as exceeding the {}th percentile".format(
                      kwrgs_events['event_percentile']))

    TV = RV_class(df_fullts, df_RV_ts, kwrgs_events)

    if precursor_ts is None:
        df_splits = functions_pp.rand_traintest_years(TV, method=method,
                                                      seed=seed,
                                                      kwrgs_events=kwrgs_events,
                                                      verb=verbosity)
    else:
        # Retrieve same train test split as imported ts
        path_data = ''.join(precursor_ts[0][1])
        stored_splits = func_fc.load_hdf5(
            path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs_imp = functions_pp.get_testyrs(stored_splits)
        df_splits = functions_pp.rand_traintest_years(TV, method=method,
                                                      seed=seed,
                                                      kwrgs_events=kwrgs_events,
                                                      verb=verbosity)
        test_yrs_set = functions_pp.get_testyrs(df_splits)
        assert (np.equal(test_yrs_imp, test_yrs_set)).all(), "Train test split not equal"
    return TV, df_splits
def forecast_wrapper(datasets=dict, kwrgs_exp=dict, kwrgs_events=dict, stat_model_l=list, lags_i=list, n_boot=0):
    '''
    Run every statistical model in ``stat_model_l`` on the dataset and collect
    validation results.

    dict should have splits (as keys) and concomitant list of keys of that
    particular split.

    Returns
    -------
    dict_sum : dict
        Maps model name -> (df_valid, RV, y_pred_all).
    '''
    # NOTE(review): `path_data` is not a parameter of this function and the
    # `datasets` argument is unused; `path_data` must exist at module level
    # for this call to succeed — confirm the intended source of the data path.
    df_data = func_fc.load_hdf5(path_data)['df_data']
    splits = df_data.index.levels[0]
    # first column is the RV; split 0 is used as reference timeseries
    RVfullts = pd.DataFrame(df_data[df_data.columns[0]][0])
    RV_ts = pd.DataFrame(df_data[df_data.columns[0]][0][df_data['RV_mask'][0]])
    fit_model_dates = kwrgs_exp['kwrgs_pp']['fit_model_dates']
    RV = func_fc.RV_class(RVfullts, RV_ts, kwrgs_events,
                          fit_model_dates=fit_model_dates)
    RV.TrainIsTrue = df_data['TrainIsTrue']
    RV.RV_mask = df_data['RV_mask']
    # fix: replicate the fit-model mask once per train/test split instead of
    # the hard-coded factor 10 (which only worked for exactly 10 splits)
    fit_model_mask = pd.concat([RV.fit_model_mask] * splits.size, keys=splits)
    df_data = df_data.merge(fit_model_mask, left_index=True, right_index=True)
    RV.prob_clim = func_fc.get_obs_clim(RV)
    dict_sum = {}
    for stat_model in stat_model_l:
        name = stat_model[0]
        df_valid, RV, y_pred_all = func_fc.forecast_and_valid(RV, df_data,
                                                              kwrgs_exp,
                                                              stat_model=stat_model,
                                                              lags_i=lags_i,
                                                              n_boot=n_boot)
        dict_sum[name] = (df_valid, RV, y_pred_all)
    return dict_sum
def run_PCMCI(ex, outdic_actors, s, df_splits, map_proj):
    """Run the PCMCI causal-discovery algorithm (tigramite) for one train/test split.

    Assembles the full data array from the RV timeseries plus all actor region
    timeseries (optionally merging externally imported precursor timeseries),
    restricts it to the training dates of split ``s``, runs PCMCI with a
    partial-correlation independence test and FDR correction, and returns the
    bookkeeping dataframe of significant parents together with the assembled
    ``df_data`` (with 'TrainIsTrue' / 'RV_mask' columns appended).

    Parameters
    ----------
    ex : dict
        Experiment settings (keys used here: 'SaveTF', 'fig_subpath',
        'pcA_sets', 'pcA_set', 'alpha_level_tig', 'RV_name', 'vars',
        'import_prec_ts', 'precursor_ts', 'tfreq', 'sstartdate', 'senddate',
        'startyear', 'endyear', 'tigr_tau_max', 'max_comb_actors', 'params').
    outdic_actors : dict
        Mapping variable name -> actor object with per-split region timeseries.
    s : int
        Index of the train/test split to process.
    df_splits : pd.DataFrame
        Multi-index dataframe with 'TrainIsTrue' and 'RV_mask' per split.
    map_proj :
        Map projection, forwarded to the region-plotting helper.

    Returns
    -------
    (df, df_data) : the parents bookkeeping dataframe and the data dataframe.
    """
    #=====================================================================================
    #
    # 4) PCMCI-algorithm
    #
    #=====================================================================================

    # save output: redirect print output to a buffer so the PCMCI log can be
    # written to a text file afterwards (StringIO on py3, a temp file on py2)
    if ex['SaveTF'] == True:
        orig_stdout = sys.stdout
        # buffer print statement output to f
        if sys.version[:1] == '3':
            sys.stdout = f = io.StringIO()
        elif sys.version[:1] == '2':
            sys.stdout = f = open(os.path.join(ex['fig_subpath'], 'old.txt'), 'w+')
    #%%
    # amount of text printed:
    verbosity = 3
    # alpha level for independence test within the pc procedure (finding parents)
    pc_alpha = ex['pcA_sets'][ex['pcA_set']]
    # alpha level for the multiple linear regression model while conditioning
    # on parents of parents
    alpha_level = ex['alpha_level_tig']
    print('run tigramite 4, run.pcmci')
    print(('alpha level(s) for independence tests within the pc procedure'
           '(finding parents): {}'.format(pc_alpha)))
    print(('alpha level for multiple linear regression model while conditining on parents of '
           'parents: {}'.format(ex['alpha_level_tig'])))

    # Retrieve traintest info
    traintest = df_splits

    # load Response Variable class
    RV = ex[ex['RV_name']]

    # create list with all actors, these will be merged into the fulldata array
    allvar = ex['vars'][0]
    var_names_corr = []
    actorlist = []
    cols = [[RV.name]]

    for var in allvar[:]:
        print(var)
        actor = outdic_actors[var]
        if actor.ts_corr[s].size != 0:
            ts_train = actor.ts_corr[s].values
            actorlist.append(ts_train)
            # create array which numbers the regions
            var_idx = allvar.index(var)
            n_regions = actor.ts_corr[s].shape[1]
            actor.var_info = [[i + 1, actor.ts_corr[s].columns[i], var_idx]
                              for i in range(n_regions)]
            # Array of corresponding regions with var_names_corr (first entry is RV)
            var_names_corr = var_names_corr + actor.var_info
            cols.append(list(actor.ts_corr[s].columns))
            # NOTE(review): index_dates keeps the index of the LAST actor with
            # non-empty timeseries — assumes all actors share one time index.
            index_dates = actor.ts_corr[s].index
    var_names_corr.insert(0, RV.name)

    # stack actor time-series together:
    fulldata = np.concatenate(tuple(actorlist), axis=1)
    print(('There are {} regions in total'.format(fulldata.shape[1])))
    # add the full 1D time series of interest as first entry:
    fulldata = np.column_stack((RV.RVfullts, fulldata))
    df_data = pd.DataFrame(fulldata, columns=flatten(cols), index=index_dates)

    if ex['import_prec_ts'] == True:
        # merge externally computed precursor timeseries into df_data
        var_names_full = var_names_corr.copy()
        for d in ex['precursor_ts']:
            path_data = d[1]
            if len(path_data) > 1:
                path_data = ''.join(list(path_data))
            # skip first col because it is the RV ts
            df_data_ext = func_fc.load_hdf5(
                path_data)['df_data'].iloc[:, 1:].loc[s]
            cols_ts = np.logical_or(df_data_ext.dtypes == 'float64',
                                    df_data_ext.dtypes == 'float32')
            cols_ext = list(df_data_ext.columns[cols_ts])
            # cols_ext must be of format '{}_{int}_{}'; rewrite columns whose
            # middle token is not an integer label, numbering from 100 up
            lab_int = 100
            for i, c in enumerate(cols_ext):
                char = c.split('_')[1]
                if char.isdigit():
                    pass
                else:
                    cols_ext[i] = c.replace(char, str(lab_int)) + char
                    lab_int += 1
            df_data_ext = df_data_ext[cols_ext]
            to_freq = ex['tfreq']
            if to_freq != 1:
                # re-bin external timeseries to the experiment's frequency
                start_end_date = (ex['sstartdate'], ex['senddate'])
                start_end_year = (ex['startyear'], ex['endyear'])
                df_data_ext = functions_pp.time_mean_bins(df_data_ext, to_freq,
                                                          start_end_date,
                                                          start_end_year,
                                                          seldays='part')[0]
            # Expand var_names_corr with the external columns, continuing the
            # region numbering and variable index from the last entry
            n = var_names_full[-1][0] + 1
            add_n = n + len(cols_ext)
            n_var_idx = var_names_full[-1][-1] + 1
            for i in range(n, add_n):
                var_names_full.append([i, cols_ext[i - n], n_var_idx])
            df_data = df_data.merge(df_data_ext, left_index=True, right_index=True)
    else:
        var_names_full = var_names_corr

    # boolean masks for training dates and RV (target-period) training dates
    bool_train = traintest.loc[s]['TrainIsTrue']
    bool_RV_train = np.logical_and(bool_train, traintest.loc[s]['RV_mask'])
    dates_train = traintest.loc[s]['TrainIsTrue'][bool_train].index
    dates_RV_train = traintest.loc[s]['TrainIsTrue'][bool_RV_train].index
    RVfull_train = RV.RVfullts.sel(time=dates_train)
    datesfull_train = pd.to_datetime(RVfull_train.time.values)
    data = df_data.loc[datesfull_train].values
    print((data.shape))

    # get RV datamask (same shape as data)
    data_mask = [True if d in dates_RV_train else False
                 for d in datesfull_train]
    data_mask = np.repeat(data_mask, data.shape[1]).reshape(data.shape)

    # add traintest mask to fulldata
    dates_all = pd.to_datetime(RV.RVfullts.time.values)
    dates_RV = pd.to_datetime(RV.RV_ts.time.values)
    df_data['TrainIsTrue'] = [True if d in datesfull_train else False
                              for d in dates_all]
    df_data['RV_mask'] = [True if d in dates_RV else False for d in dates_all]

    T, N = data.shape  # Time, Regions
    # Initialize dataframe object (needed for tigramite functions)
    dataframe = pp.DataFrame(data=data, mask=data_mask, var_names=var_names_full)
    # pc algorithm: only parents for selected_variables are calculated
    parcorr = ParCorr(significance='analytic', mask_type='y',
                      verbosity=verbosity)
    pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr,
                  selected_variables=None, verbosity=4)
    # selected_variables : list of integers, optional (default: range(N))
    #     Specify to estimate parents only for selected variables. If None is
    #     passed, parents are estimated for all variables.
    results = pcmci.run_pcmci(tau_max=ex['tigr_tau_max'], pc_alpha=pc_alpha,
                              tau_min=0,
                              max_combinations=ex['max_comb_actors'])
    # correct for the multiple-testing problem (Benjamini-Hochberg FDR)
    q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                           fdr_method='fdr_bh')
    pcmci.print_significant_links(p_matrix=results['p_matrix'],
                                  q_matrix=q_matrix,
                                  val_matrix=results['val_matrix'],
                                  alpha_level=alpha_level)
    # returns all parents, not just causal precursors (of lag>0)
    sig = rgcpd.return_sign_parents(pcmci, pq_matrix=q_matrix,
                                    val_matrix=results['val_matrix'],
                                    alpha_level=alpha_level)
    all_parents = sig['parents']
    links_RV = all_parents[0]
    df = rgcpd.bookkeeping_precursors(links_RV, var_names_full)
    #%%
    rgcpd.print_particular_region_new(links_RV, var_names_corr, s,
                                      outdic_actors, map_proj, ex)
    #%%
    # flush the captured stdout to a per-split text file and restore stdout
    if ex['SaveTF'] == True:
        if sys.version[:1] == '3':
            fname = f's{s}_' + ex['params'] + '.txt'
            file = io.open(os.path.join(ex['fig_subpath'], fname), mode='w+')
            file.write(f.getvalue())
            file.close()
            f.close()
        elif sys.version[:1] == '2':
            f.close()
        sys.stdout = orig_stdout
    return df, df_data
def normal_precursor_regions(path_data, keys_options=None, causal=False):
    #%%
    '''
    Select precursor column names per option and per train/test split.

    keys_options=['all', 'only_db_regs', 'sp_and_regs', 'sst+sm+RWT',
                  'sst(CPPA)+sm', 'sst(PEP)+sm', 'sst(PDO,ENSO)+sm',
                  'sst(CPPA)']
    (Default changed from a mutable default argument to a None sentinel.)

    Returns
    -------
    keys_d : dict
        keys_d[option][split] = np.array of unique column names.
    '''
    if keys_options is None:
        keys_options = ['all']
    dict_of_dfs = func_fc.load_hdf5(path_data)
    df_data = dict_of_dfs['df_data']
    splits = df_data.index.levels[0]
    # df_sum is only present when causal columns were computed; it is only
    # needed when a causal selection is requested below
    try:
        df_sum = dict_of_dfs['df_sum']
    except KeyError:  # fix: was a bare `except:`, which hid any error type
        pass
    skip = ['all_spatcov']
    keys_d = {}
    for option in keys_options:
        keys_d_ = {}
        for s in splits:
            if causal == True or 'causal' in option:
                # causal: take only the columns flagged causal in df_sum
                # (NameError here means df_sum was absent from the file)
                all_keys = df_sum[df_sum['causal']].loc[s].index
            elif causal == False and 'causal' not in option:
                # correlated: all float columns except the RV (first) column
                df_s = df_data.loc[s]
                all_keys = df_s.columns.delete(0)
                # extract only float columns
                mask_f = np.logical_or(df_s.dtypes == 'float64',
                                       df_s.dtypes == 'float32')
                all_keys = all_keys[mask_f[1:].values]
                # remove spatcov_causals
                all_keys = [k for k in all_keys if k[-4:] != 'caus']
            if option == 'all':
                keys_ = [k for k in all_keys if k not in skip]
            elif 'only_db_regs' in option:
                # Regions only (drop every spatial-covariance column)
                keys_ = [k for k in all_keys if ('spatcov' not in k)]
                keys_ = [k for k in keys_ if k not in skip]
            elif option == 'sp_and_regs':
                keys_ = [k for k in all_keys if k not in skip]
            elif option == 'sst(CPPA)':
                skip_ex = ['0_900_ENSO34', '0_901_PDO', '0_101_PEPspatcov',
                           'sm123_spatcov', 'all_spatcov']
                keys_ = [k for k in all_keys if 'v200hpa' not in k]
                keys_ = [k for k in keys_ if 'sm' not in k]
                keys_ = [k for k in keys_ if k not in skip_ex]
            elif option == 'sst+sm+RWT':
                keys_ = [k for k in all_keys if k[-7:] != 'v200hpa']
                keys_ = [k for k in keys_ if k not in skip]
            elif option == 'sst(CPPA)+sm':
                skip_ex = ['0_900_ENSO34', '0_901_PDO', '0_101_PEPspatcov',
                           'sm123_spatcov', 'all_spatcov']
                keys_ = [k for k in all_keys if 'v200hpa' not in k]
                keys_ = [k for k in keys_ if k not in skip]
                keys_ = [k for k in keys_ if k not in skip_ex]
            elif option == 'CPPAregs+sm':
                skip_ex = ['0_900_ENSO34', '0_901_PDO']
                keys_ = [k for k in all_keys if 'v200hpa' not in k]
                keys_ = [k for k in keys_ if k not in skip]
                keys_ = [k for k in keys_ if ('spatcov' not in k)]
                keys_ = [k for k in keys_ if k not in skip_ex]
            elif option == 'CPPApattern+sm':
                skip_ex = ['0_900_ENSO34', '0_901_PDO']
                keys_ = [k for k in all_keys if 'v200hpa' not in k]
                keys_ = [k for k in keys_ if k not in skip]
                keys_ = [k for k in keys_ if ('spatcov' in k or 'sm' in k)]
            elif option == 'sm':
                keys_ = [k for k in all_keys if 'sm' in k]
                keys_ = [k for k in keys_ if 'spatcov' not in k]
            elif option == 'sst(PEP)+sm':
                keys_ = [k for k in all_keys if 'sm' in k or 'PEP' in k]
                keys_ = [k for k in keys_ if k != 'sm123_spatcov']
            elif option == 'sst(PDO,ENSO)+sm':
                keys_ = [k for k in all_keys
                         if 'sm' in k or 'PDO' in k or 'ENSO' in k]
                keys_ = [k for k in keys_ if 'spatcov' not in k]
            keys_d_[s] = np.unique(keys_)
        keys_d[option] = keys_d_
    #%%
    return keys_d
def compare_use_spatcov(path_data, causal=True):
    #%%
    '''
    Build four alternative key selections to compare the use of spatial
    covariance columns against region columns.

    Parameters
    ----------
    path_data : str
        Path to an hdf5 file containing 'df_data' and 'df_sum'.
    causal : bool
        If True select only columns flagged causal in df_sum, else all
        columns except the first.

    Returns
    -------
    keys_d : dict
        Maps selection name -> {split: np.array of column names}.
    '''
    dict_of_dfs = func_fc.load_hdf5(path_data)
    df_data = dict_of_dfs['df_data']
    splits = df_data.index.levels[0]
    df_sum = dict_of_dfs['df_sum']

    def _keys_per_split(keep):
        # One selection pass over all splits; `keep` decides per key.
        # (fix: replaces four copy-pasted, nearly identical loops)
        keys_d_ = {}
        for s in splits:
            if causal:
                # causal
                all_keys = df_sum[df_sum['causal']].loc[s].index
            else:
                # correlated
                all_keys = df_sum.loc[s].index.delete(0)
            keys_d_[s] = np.array([k for k in all_keys if keep(k)])
        return keys_d_

    keys_d = {}
    # only the aggregated spatial covariance column(s)
    keys_d['Only_all_spatcov'] = _keys_per_split(
        lambda k: ('spatcov_caus' in k) and k[:3] == 'all')
    # Regions + all_spatcov(_caus)
    keys_d['Regions_all_spatcov'] = _keys_per_split(
        lambda k: ('spatcov_caus' not in k) or k[:3] == 'all')
    # only spatcov(_caus) (no all_spatcov)
    keys_d['only_sp_caus_no_all_sp'] = _keys_per_split(
        lambda k: ('spatcov' in k) and ('all' not in k))
    # regions + spatcov(_caus) (no all_spatcov)
    keys_d['Regions_sp_caus_no_all_sp'] = _keys_per_split(
        lambda k: ('all' not in k))
    #%%
    return keys_d
n_ev = int(p * n_steps) rand_true = np.zeros((n_steps)) ind = np.random.choice(range(n_steps), n_ev) rand_true[ind] = 1 rand_scores.append(brier_score_loss(rand_true, np.repeat(p, n_steps))) plt.plot(np.linspace(0, 1, 19), rand_scores) #%% Keep forecast the same, change event definition import func_fc import validation as valid import classes import pandas as pd path_fig = '/Users/semvijverberg/surfdrive/MckinRepl/era5_T2mmax_sst_Northern/ran_strat10_s30/figures' path_data = '/Users/semvijverberg/surfdrive/RGCPD_mcKinnon/t2mmax_E-US_sm123_m01-09_dt10/18jun-17aug_lag0-0_ran_strat10_s30/pcA_none_ac0.05_at0.05_subinfo/fulldata_pcA_none_ac0.05_at0.05_2019-09-24.h5' df_data = func_fc.load_hdf5(path_data)['df_data'] splits = df_data.index.levels[0] RVfullts = pd.DataFrame(df_data[df_data.columns[0]][0]) RV_ts = pd.DataFrame(df_data[df_data.columns[0]][0][df_data['RV_mask'][0]]) thresholds = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 99] blocksize = valid.get_bstrap_size(RV.RVfullts, plot=False) metric = 'AUC-ROC' df_list = [] scores = [] for r in thresholds: kwrgs_events = { 'event_percentile': r, 'min_dur': 1, 'max_break': 0, 'grouped': False
def normal_precursor_regions(path_data, keys_options=None, causal=True):
    #%%
    '''
    Select precursor column names per option and per train/test split.

    keys_options=['all', 'only_db_regs', 'sp_and_regs', 'sst+sm+RWT',
                  'sst(CPPA)+sm', 'sst(PEP)+sm', 'sst(PDO,ENSO)+sm']
    (Default changed from a mutable default argument to a None sentinel.)

    Returns
    -------
    keys_d : dict
        keys_d[option][split] = array of column names in first-seen order.
    '''
    if keys_options is None:
        keys_options = ['all']
    dict_of_dfs = func_fc.load_hdf5(path_data)
    df_data = dict_of_dfs['df_data']
    splits = df_data.index.levels[0]
    df_sum = dict_of_dfs['df_sum']
    skip = ['all_spatcov', '0_2_sm123', '0_101_PEPspatcov', 'sm123_spatcov']
    keys_d = {}
    for option in keys_options:
        keys_d_ = {}
        for s in splits:
            if causal == True:
                # causal
                # fix: assign the causal columns to `all_keys` (originally
                # assigned to `keys_`, leaving `all_keys` undefined for every
                # option filter below -> NameError when causal=True)
                all_keys = df_sum[df_sum['causal']].loc[s].index
            elif causal == False:
                # correlated
                all_keys = df_sum.loc[s].index.delete(0)
                # remove spatcov_causals
                all_keys = [k for k in all_keys if k[-4:] != 'caus']
            if option == 'all':
                keys_ = [k for k in all_keys if k not in skip]
            elif option == 'only_db_regs':
                # Regions only (drop every spatial-covariance column)
                keys_ = [k for k in all_keys if ('spatcov' not in k)]
                keys_ = [k for k in keys_ if k not in skip]
            elif option == 'sp_and_regs':
                keys_ = [k for k in all_keys if k not in skip]
            elif option == 'sst+sm+RWT':
                keys_ = [k for k in all_keys if k[-7:] != 'v200hpa']
                keys_ = [k for k in keys_ if k not in skip]
            elif option == 'sst(CPPA)+sm':
                skip_ex = ['0_900_ENSO34', '0_901_PDO']
                keys_ = [k for k in all_keys if 'v200hpa' not in k]
                keys_ = [k for k in keys_ if k not in skip]
                keys_ = [k for k in keys_ if k not in skip_ex]
            elif option == 'CPPAregs+sm':
                skip_ex = ['0_900_ENSO34', '0_901_PDO']
                keys_ = [k for k in all_keys if 'v200hpa' not in k]
                keys_ = [k for k in keys_ if k not in skip]
                keys_ = [k for k in keys_ if ('spatcov' not in k)]
                keys_ = [k for k in keys_ if k not in skip_ex]
            elif option == 'CPPApattern+sm':
                skip_ex = ['0_900_ENSO34', '0_901_PDO']
                keys_ = [k for k in all_keys if 'v200hpa' not in k]
                keys_ = [k for k in keys_ if k not in skip]
                keys_ = [k for k in keys_ if ('spatcov' in k or 'sm' in k)]
            elif option == 'sst(PEP)+sm':
                keys_ = [k for k in all_keys if 'sm' in k or 'PEP' in k]
                keys_ = [k for k in keys_ if k != 'sm123_spatcov']
            elif option == 'sst(PDO,ENSO)+sm':
                keys_ = [k for k in all_keys
                         if 'sm' in k or 'PDO' in k or 'ENSO' in k]
                keys_ = [k for k in keys_ if 'spatcov' not in k]
            keys_d_[s] = np.array(list(unique_everseen(keys_)))
        keys_d[option] = keys_d_
    #%%
    return keys_d
'min_dur': 1, 'max_break': 0, 'grouped': False } dict_experiments = {} for dataset, tuple_sett in experiments.items(): ''' Format output is dict( exper_name = dict( model=tuple(df_valid, RV, y_pred) ) ) ''' path_data = tuple_sett[0] kwrgs_exp = tuple_sett[1] dict_of_dfs = func_fc.load_hdf5(path_data) df_data = dict_of_dfs['df_data'] splits = df_data.index.levels[0] tfreq = (df_data.loc[0].index[1] - df_data.loc[0].index[0]).days if tfreq == 1: lags_i = np.arange(0, 70 + 1E-9, max(10, tfreq), dtype=int) else: lags_i = np.array(np.arange(0, 70 + 1E-9, max(10, tfreq)) / max(10, tfreq), dtype=int) # lags_i = np.array([0], dtype=int) if 'keys' not in kwrgs_exp: # if keys not defined, getting causal keys kwrgs_exp['keys'] = exp_fc.normal_precursor_regions(
def calculate_corr_maps(ex, map_proj):
    #%%
    '''
    Calculate correlation maps of each precursor field vs. the Response
    Variable, cluster the significantly correlated gridcells into precursor
    regions (DBSCAN) and optionally plot the correlation maps.

    Parameters
    ----------
    ex : dict
        Experiment settings; also acts as a container for per-variable actor
        objects (ex[var]) and the RV class (ex[ex['RV_name']]).
    map_proj :
        Map projection used for plotting.

    Returns
    -------
    ex : dict
        Updated settings dictionary.
    outdic_actors : dict
        Maps variable name -> actor object holding the correlation map and
        the clustered region labels.
    '''
    # Response Variable is what we want to predict
    RV = ex[ex['RV_name']]
    # time-cycle of data: total timesteps in one year
    ex['time_cycle'] = RV.dates[RV.dates.year == RV.startyear].size
    ex['time_range_all'] = [0, RV.dates.size]

    #==================================================================================
    # Start of experiment
    #==================================================================================
    # Define traintest:
    df_RVfullts = pd.DataFrame(RV.RVfullts.values,
                               index=pd.to_datetime(RV.RVfullts.time.values))
    df_RV_ts = pd.DataFrame(RV.RV_ts.values,
                            index=pd.to_datetime(RV.RV_ts.time.values))
    if ex['method'][:9] == 'ran_strat':
        kwrgs_events = ex['kwrgs_events']
        RV = func_fc.RV_class(df_RVfullts, df_RV_ts, kwrgs_events)
    else:
        RV = func_fc.RV_class(df_RVfullts, df_RV_ts)

    if ex['import_prec_ts']:
        # Retrieve same train test split as imported ts
        path_data = ''.join(ex['precursor_ts'][0][1])
        df_splits = func_fc.load_hdf5(
            path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs = functions_pp.get_testyrs(df_splits)
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex, test_yrs)
        assert (np.equal(test_yrs, ex['tested_yrs'])).all(), "Train test split not equal"
    else:
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)

    # =============================================================================
    # 2) DEFINE PRECURSOR COMMUNITIES
    # - calculate and plot pattern correlation for different fields
    # - create time-series over these regions
    # =============================================================================
    outdic_actors = dict()

    class act:
        # Lightweight container for one precursor field + its correlation map.
        def __init__(self, name, corr_xr, precur_arr):
            # fix: was `self.name = var` (silently relying on the enclosing
            # loop variable instead of the constructor argument)
            self.name = name
            self.corr_xr = corr_xr
            self.precur_arr = precur_arr
            self.lat_grid = precur_arr.latitude.values
            self.lon_grid = precur_arr.longitude.values
            self.area_grid = rgcpd.get_area(precur_arr)
            self.grid_res = abs(self.lon_grid[1] - self.lon_grid[0])

    allvar = ex['vars'][0]  # list of all variable names
    for var in allvar[ex['excludeRV']:]:  # loop over all variables
        actor = ex[var]
        #===========================================
        # 3c) Precursor field
        #===========================================
        file_path = os.path.join(actor.path_pp, actor.filename_pp)
        precur_arr = functions_pp.import_ds_timemeanbins(file_path, ex)
        # =============================================================================
        # Calculate correlation
        # =============================================================================
        corr_xr = rgcpd.calc_corr_coeffs_new(precur_arr, RV, ex)
        # =============================================================================
        # Cluster into precursor regions
        # =============================================================================
        actor = act(var, corr_xr, precur_arr)
        actor, ex = rgcpd.cluster_DBSCAN_regions(actor, ex)
        if np.isnan(actor.prec_labels.values).all() == False:
            rgcpd.plot_regs_xarray(actor.prec_labels.copy(), ex)
        outdic_actors[var] = actor
        # =============================================================================
        # Plot
        # =============================================================================
        if ex['plotin1fig'] == False:
            plot_maps.plot_corr_maps(corr_xr, corr_xr['mask'], map_proj)
            fig_filename = '{}_corr_{}_vs_{}'.format(
                ex['params'], ex['RV_name'], var) + ex['file_type2']
            plt.savefig(os.path.join(ex['fig_path'], fig_filename),
                        bbox_inches='tight', dpi=200)
            if ex['showplot'] == False:
                plt.close()
    #%%
    return ex, outdic_actors