def RV_and_traintest(fullts, TV_ts, method=str, kwrgs_events=None, precursor_ts=None, seed=int, verbosity=1):
    """Wrap the target variable (RV) into an RV_class and create a train/test split.

    NOTE(review): the defaults ``method=str`` and ``seed=int`` are the *type objects*
    ``str``/``int``, not values — calling this without explicit ``method``/``seed``
    would fail inside ``method[:9]`` / the split routine. Presumably callers always
    pass both; confirm and replace with real defaults.

    Parameters
    ----------
    fullts, TV_ts : objects with ``.values`` and ``.time.values`` (xarray-like —
        TODO confirm) holding the full timeseries and the target-period timeseries.
    method : str
        Train/test split method; a ``'ran_strat*'`` prefix triggers stratified splitting.
    kwrgs_events : dict, optional
        Event definition used for stratified splitting.
    precursor_ts : list of tuples, optional
        ``[(name, path_data)]``; when given, the split of the imported data is
        loaded and must match the newly generated split.
    seed : int
        Random seed forwarded to the split routine.
    verbosity : int
        Verbosity level forwarded as ``verb``.

    Returns
    -------
    (TV, df_splits) : the RV_class instance and the train/test split DataFrame.
    """
    # Define traintest:
    df_RVfullts = pd.DataFrame(fullts.values, index=pd.to_datetime(fullts.time.values))
    df_RV_ts = pd.DataFrame(TV_ts.values, index=pd.to_datetime(TV_ts.time.values))
    if method[:9] == 'ran_strat':
        # NOTE(review): self-assignment is a no-op; likely a leftover from copying
        # ``kwrgs_events = ex['kwrgs_events']`` — safe to delete once confirmed.
        kwrgs_events = kwrgs_events
        TV = classes.RV_class(df_RVfullts, df_RV_ts, kwrgs_events)
    else:
        TV = classes.RV_class(df_RVfullts, df_RV_ts)
    if precursor_ts is not None:
        # Retrieve same train test split as imported ts
        path_data = ''.join(precursor_ts[0][1])
        df_splits = func_fc.load_hdf5(path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs_imp = functions_pp.get_testyrs(df_splits)
        # Regenerate the split with the same method/seed and verify it matches
        # the split stored alongside the imported timeseries.
        df_splits = functions_pp.rand_traintest_years(TV, method=method, seed=seed,
                                                      kwrgs_events=kwrgs_events,
                                                      verb=verbosity)
        test_yrs_set = functions_pp.get_testyrs(df_splits)
        assert (np.equal(test_yrs_imp, test_yrs_set)).all(), "Train test split not equal"
    else:
        df_splits = functions_pp.rand_traintest_years(TV, method=method, seed=seed,
                                                      kwrgs_events=kwrgs_events,
                                                      verb=verbosity)
    return TV, df_splits
def RV_and_traintest(fullts, TV_ts, method=str, kwrgs_events=None, precursor_ts=None, seed=int, verbosity=1):
    """Build the RV_class target object and the accompanying train/test split.

    When ``method`` starts with ``'ran_strat'`` and no event definition is given,
    a default 66th-percentile event definition is created so that stratified
    splitting is possible. If ``precursor_ts`` is provided, the newly generated
    split is checked against the split stored with the imported timeseries.

    Returns ``(TV, df_splits)``.
    """
    # Define traintest:
    dt_full = pd.to_datetime(fullts.time.values)
    dt_RV = pd.to_datetime(TV_ts.time.values)
    df_fullts = pd.DataFrame(fullts.values, index=dt_full)
    df_RV_ts = pd.DataFrame(TV_ts.values, index=dt_RV)

    stratified = method[:9] == 'ran_strat'
    if stratified and kwrgs_events is None:
        # events need to be defined to enable stratified traintest.
        kwrgs_events = {'event_percentile': 66,
                        'min_dur': 1,
                        'max_break': 0,
                        'grouped': False}
        if verbosity == 1:
            print("kwrgs_events not given, creating stratified traintest split "
                  "based on events defined as exceeding the {}th percentile".format(
                      kwrgs_events['event_percentile']))

    TV = RV_class(df_fullts, df_RV_ts, kwrgs_events)

    if precursor_ts is None:
        df_splits = functions_pp.rand_traintest_years(TV, method=method,
                                                      seed=seed,
                                                      kwrgs_events=kwrgs_events,
                                                      verb=verbosity)
        return TV, df_splits

    # Retrieve same train test split as imported ts
    path_data = ''.join(precursor_ts[0][1])
    df_imported = func_fc.load_hdf5(path_data)['df_data']
    test_yrs_imp = functions_pp.get_testyrs(df_imported.loc[:, ['TrainIsTrue', 'RV_mask']])
    df_splits = functions_pp.rand_traintest_years(TV, method=method,
                                                  seed=seed,
                                                  kwrgs_events=kwrgs_events,
                                                  verb=verbosity)
    test_yrs_set = functions_pp.get_testyrs(df_splits)
    assert (np.equal(test_yrs_imp, test_yrs_set)).all(), "Train test split not equal"
    return TV, df_splits
def import_precur_ts(list_import_ts: List[tuple], df_splits: pd.DataFrame,
                     start_end_date: Tuple[str, str],
                     start_end_year: Tuple[int, int],
                     start_end_TVdate: Tuple[str, str],
                     cols: list = None,
                     precur_aggr: int = 1):
    '''
    Load externally stored precursor timeseries and align them with df_splits.

    list_import_ts has format List[tuples], [(name, path_data)]

    Parameters
    ----------
    df_splits : pd.DataFrame
        MultiIndex (split, time) DataFrame carrying 'TrainIsTrue'/'RV_mask';
        its train/test years must match any split stored with the imported data.
    cols : list or str, optional
        Columns to extract from the first imported file; when None, all
        non-boolean columns are used. Reset after each file so subsequent
        files auto-detect their own columns.
    precur_aggr : int
        Target temporal aggregation in days; imported series on a different
        frequency are re-binned via functions_pp.time_mean_bins.

    Returns
    -------
    pd.DataFrame with MultiIndex (split, time), one merge per imported file.
    '''
    #%%
    splits = df_splits.index.levels[0]
    orig_traintest = functions_pp.get_testyrs(df_splits)
    df_data_ext_s = np.zeros((splits.size), dtype=object)
    counter = 0
    for i, (name, path_data) in enumerate(list_import_ts):
        df_data_e_all = functions_pp.load_hdf5(path_data)['df_data']
        if isinstance(df_data_e_all, pd.Series):
            df_data_e_all = pd.DataFrame(df_data_e_all)
        df_data_e_all = df_data_e_all.iloc[:, :]  # not sure why needed
        if cols is None:
            # default to every non-boolean column (bools are the split masks)
            cols = list(
                df_data_e_all.columns[(df_data_e_all.dtypes != bool).values])
        elif isinstance(cols, str):
            cols = [cols]

        if hasattr(df_data_e_all.index, 'levels'):
            # MultiIndex (split, time): subset dates within each split
            dates_subset = core_pp.get_subdates(df_data_e_all.loc[0].index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[pd.IndexSlice[:, dates_subset], :]
        else:
            dates_subset = core_pp.get_subdates(df_data_e_all.index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[dates_subset]

        if 'TrainIsTrue' in df_data_e_all.columns:
            _c = [k for k in df_splits.columns
                  if k in ['TrainIsTrue', 'RV_mask']]
            # check if traintest split is correct
            ext_traintest = functions_pp.get_testyrs(df_data_e_all[_c])
            _check_traintest = all(
                np.equal(core_pp.flatten(ext_traintest),
                         core_pp.flatten(orig_traintest)))
            assert _check_traintest, (
                'Train test years of df_splits are not the '
                'same as imported timeseries')

        for s in range(splits.size):
            if 'TrainIsTrue' in df_data_e_all.columns:
                df_data_e = df_data_e_all.loc[s]
            else:
                df_data_e = df_data_e_all
            df_data_ext_s[s] = df_data_e[cols]
            tfreq_date_e = (df_data_e.index[1] - df_data_e.index[0]).days
            if precur_aggr != tfreq_date_e:
                try:
                    df_data_ext_s[s] = functions_pp.time_mean_bins(
                        df_data_ext_s[s], precur_aggr, start_end_date,
                        start_end_year, start_end_TVdate=start_end_TVdate)[0]
                except KeyError as e:
                    # BUGFIX: implicit string concatenation previously produced
                    # "are notfound" (missing space between the two literals).
                    print('KeyError captured, likely the requested dates '
                          'given by start_end_date and start_end_year are not '
                          'found in external pandas timeseries.\n{}'.format(
                              str(e)))
        # BUGFIX: corrected typos in user-facing message
        # ('exterinal timeseres' -> 'external timeseries').
        print(f'loaded in external timeseries: {cols}')

        if counter == 0:
            df_data_ext = pd.concat(list(df_data_ext_s),
                                    keys=range(splits.size))
        else:
            df_add = pd.concat(list(df_data_ext_s), keys=range(splits.size))
            df_data_ext = df_data_ext.merge(df_add, left_index=True,
                                            right_index=True)
        counter += 1
        cols = None  # next file should auto-detect its own columns
    #%%
    return df_data_ext
def calculate_corr_maps(ex, map_proj):
    """Compute correlation maps between the response variable and each precursor field.

    For every precursor variable in ``ex['vars']`` (after the RV itself) this
    loads the pre-processed field, correlates it with the RV, clusters the
    significant regions with DBSCAN, and optionally plots/saves the maps.

    Parameters
    ----------
    ex : dict
        Experiment settings dictionary (paths, method, kwrgs_events, plotting
        flags, ...); mutated in place by the helper routines.
    map_proj : cartopy projection (TODO confirm) used for plotting.

    Returns
    -------
    (ex, outdic_actors) : the updated settings dict and a mapping
        variable name -> actor object holding corr_xr / precursor field / grid info.
    """
    #%%
    # =============================================================================
    # Load 'exp' dictionairy with information of pre-processed data (variables, paths, filenames, etcetera..)
    # and add RGCPD/Tigramite experiment settings
    # =============================================================================
    # Response Variable is what we want to predict
    RV = ex[ex['RV_name']]
    # time-cycle of data: total timesteps in one year
    ex['time_cycle'] = RV.dates[RV.dates.year == RV.startyear].size
    ex['time_range_all'] = [0, RV.dates.size]
    #==================================================================================
    # Start of experiment
    #==================================================================================

    # Define traintest:
    df_RVfullts = pd.DataFrame(RV.RVfullts.values,
                               index=pd.to_datetime(RV.RVfullts.time.values))
    df_RV_ts = pd.DataFrame(RV.RV_ts.values,
                            index=pd.to_datetime(RV.RV_ts.time.values))
    if ex['method'][:9] == 'ran_strat':
        kwrgs_events = ex['kwrgs_events']
        RV = func_fc.RV_class(df_RVfullts, df_RV_ts, kwrgs_events)
    else:
        RV = func_fc.RV_class(df_RVfullts, df_RV_ts)

    if ex['import_prec_ts']:
        # Retrieve same train test split as imported ts
        path_data = ''.join(ex['precursor_ts'][0][1])
        df_splits = func_fc.load_hdf5(
            path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs = functions_pp.get_testyrs(df_splits)
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex, test_yrs)
        assert (np.equal(test_yrs, ex['tested_yrs'])).all(), "Train test split not equal"
    else:
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)

    # =============================================================================
    # 2) DEFINE PRECURSOR COMMUNITIES:
    # =============================================================================
    # - calculate and plot pattern correlation for different fields
    # - create time-series over these regions
    #=====================================================================================
    outdic_actors = dict()

    class act:
        """Container for one precursor variable: correlation map, field and grid info."""
        def __init__(self, name, corr_xr, precur_arr):
            # BUGFIX: previously assigned the enclosing loop variable ``var``
            # instead of the ``name`` parameter (worked only by accident).
            self.name = name
            self.corr_xr = corr_xr
            self.precur_arr = precur_arr
            self.lat_grid = precur_arr.latitude.values
            self.lon_grid = precur_arr.longitude.values
            self.area_grid = rgcpd.get_area(precur_arr)
            self.grid_res = abs(self.lon_grid[1] - self.lon_grid[0])

    allvar = ex['vars'][0]  # list of all variable names
    for var in allvar[ex['excludeRV']:]:  # loop over all variables
        actor = ex[var]
        #===========================================
        # 3c) Precursor field
        #===========================================
        file_path = os.path.join(actor.path_pp, actor.filename_pp)
        precur_arr = functions_pp.import_ds_timemeanbins(file_path, ex)
        # precur_arr = rgcpd.convert_longitude(precur_arr, 'only_east')
        # =============================================================================
        # Calculate correlation
        # =============================================================================
        corr_xr = rgcpd.calc_corr_coeffs_new(precur_arr, RV, ex)
        # =============================================================================
        # Cluster into precursor regions
        # =============================================================================
        actor = act(var, corr_xr, precur_arr)
        actor, ex = rgcpd.cluster_DBSCAN_regions(actor, ex)
        if not np.isnan(actor.prec_labels.values).all():
            rgcpd.plot_regs_xarray(actor.prec_labels.copy(), ex)
        outdic_actors[var] = actor
        # =============================================================================
        # Plot
        # =============================================================================
        if not ex['plotin1fig']:
            plot_maps.plot_corr_maps(corr_xr, corr_xr['mask'], map_proj)
            fig_filename = '{}_corr_{}_vs_{}'.format(
                ex['params'], ex['RV_name'], var) + ex['file_type2']
            plt.savefig(os.path.join(ex['fig_path'], fig_filename),
                        bbox_inches='tight', dpi=200)
            if not ex['showplot']:
                plt.close()
    #%%
    return ex, outdic_actors