def RV_and_traintest(fullts, TV_ts, method='random9', kwrgs_events=None,
                     precursor_ts=None, seed=1, verbosity=1):
    """Build the target-variable (RV) object and its train/test split.

    Parameters
    ----------
    fullts, TV_ts : xarray objects with a ``time`` coordinate; full timeseries
        and the target (RV) period timeseries respectively.
    method : str, split strategy passed to ``functions_pp.rand_traintest_years``.
        BUGFIX: the former defaults ``method=str`` / ``seed=int`` were the type
        objects themselves; calling with defaults crashed on ``method[:9]``.
    kwrgs_events : dict or None, event definition; only forwarded to
        ``classes.RV_class`` for stratified ('ran_strat*') splits.
    precursor_ts : optional; when given, the split stored in the imported
        precursor file is expected to match the newly generated one.
    seed : int, RNG seed for the random split.
    verbosity : int, forwarded as ``verb``.

    Returns
    -------
    (TV, df_splits) : the RV_class instance and the split DataFrame.
    """
    # Define traintest:
    df_RVfullts = pd.DataFrame(fullts.values,
                               index=pd.to_datetime(fullts.time.values))
    df_RV_ts = pd.DataFrame(TV_ts.values,
                            index=pd.to_datetime(TV_ts.time.values))
    if method[:9] == 'ran_strat':
        # Stratified split needs the event definition inside the RV class.
        TV = classes.RV_class(df_RVfullts, df_RV_ts, kwrgs_events)
    else:
        TV = classes.RV_class(df_RVfullts, df_RV_ts)
    if precursor_ts is not None:
        # Retrieve same train test split as imported ts
        path_data = ''.join(precursor_ts[0][1])
        df_splits = func_fc.load_hdf5(
            path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs_imp = functions_pp.get_testyrs(df_splits)
        df_splits = functions_pp.rand_traintest_years(
            TV, method=method, seed=seed,
            kwrgs_events=kwrgs_events, verb=verbosity)
        test_yrs_set = functions_pp.get_testyrs(df_splits)
        # Guard: the regenerated split must reproduce the imported one.
        assert (np.equal(test_yrs_imp, test_yrs_set)).all(), \
            "Train test split not equal"
    else:
        df_splits = functions_pp.rand_traintest_years(
            TV, method=method, seed=seed,
            kwrgs_events=kwrgs_events, verb=verbosity)
    return TV, df_splits
def RV_and_traintest(fullts, TV_ts, method='random9', kwrgs_events=None,
                     precursor_ts=None, seed=1, verbosity=1):
    """Build the target-variable (RV) object and its train/test split.

    Differs from the sibling variant in that a default event definition is
    created when a stratified split is requested without ``kwrgs_events``.

    Parameters
    ----------
    fullts, TV_ts : xarray objects with a ``time`` coordinate.
    method : str, split strategy. BUGFIX: the former defaults ``method=str`` /
        ``seed=int`` were the type objects; calling with defaults crashed on
        ``method[:9]``.
    kwrgs_events : dict or None; auto-filled for 'ran_strat*' methods.
    precursor_ts : optional; imported split must match the generated one.
    seed : int, RNG seed. verbosity : int, forwarded as ``verb``.

    Returns
    -------
    (TV, df_splits)
    """
    # Define traintest:
    df_fullts = pd.DataFrame(fullts.values,
                             index=pd.to_datetime(fullts.time.values))
    df_RV_ts = pd.DataFrame(TV_ts.values,
                            index=pd.to_datetime(TV_ts.time.values))
    if method[:9] == 'ran_strat' and kwrgs_events is None:
        # events need to be defined to enable stratified traintest.
        kwrgs_events = {'event_percentile': 66,
                        'min_dur': 1,
                        'max_break': 0,
                        'grouped': False}
        if verbosity == 1:
            print("kwrgs_events not given, creating stratified traintest split "
                  "based on events defined as exceeding the {}th percentile".format(
                      kwrgs_events['event_percentile']))
    TV = RV_class(df_fullts, df_RV_ts, kwrgs_events)
    if precursor_ts is not None:
        # Retrieve same train test split as imported ts
        path_data = ''.join(precursor_ts[0][1])
        df_splits = func_fc.load_hdf5(
            path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs_imp = functions_pp.get_testyrs(df_splits)
        df_splits = functions_pp.rand_traintest_years(
            TV, method=method, seed=seed,
            kwrgs_events=kwrgs_events, verb=verbosity)
        test_yrs_set = functions_pp.get_testyrs(df_splits)
        # Guard: the regenerated split must reproduce the imported one.
        assert (np.equal(test_yrs_imp, test_yrs_set)).all(), \
            "Train test split not equal"
    else:
        df_splits = functions_pp.rand_traintest_years(
            TV, method=method, seed=seed,
            kwrgs_events=kwrgs_events, verb=verbosity)
    return TV, df_splits
def ENSO_34(file_path, ex, df_splits=None):
    #%%
    # file_path = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/sst_1979-2018_1_12_daily_2.5deg.nc'
    '''
    Compute the Nino3.4 index (area-weighted mean SST over 5S-5N, 170W-120W),
    replicated per train/test split.

    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf

    Parameters
    ----------
    file_path : path to the sst netcdf file.
    ex : experiment dict; 'tfreq' is read, and 'RV_name' when df_splits is None.
    df_splits : DataFrame with a (split, time) MultiIndex, or None to generate
        a split from `ex`.

    Returns
    -------
    df_ENSO : DataFrame, column '0_900_ENSO34', one copy per split.
    '''
    if df_splits is None:
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)
        seldates = None
    else:
        seldates = df_splits.loc[0].index
    kwrgs_pp = {'selbox': {'la_min': -5,  # select domain in degrees east
                           'la_max': 5,
                           'lo_min': -170,
                           'lo_max': -120},
                'seldates': seldates}
    ds = core_pp.import_ds_lazy(file_path, **kwrgs_pp)
    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = functions_pp.time_mean_bins(ds, ex, to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates
    dates = pd.to_datetime(ds.time.values)
    splits = df_splits.index.levels[0]
    # PERF: the index itself does not depend on the split; compute it once
    # instead of once per split (the original recomputed it in the loop).
    data = functions_pp.area_weighted(ds).mean(dim=('latitude', 'longitude'))
    list_splits = []
    for s in splits:
        progress = 100 * (s + 1) / splits.size
        print(f"\rProgress ENSO traintest set {progress}%)", end="")
        list_splits.append(pd.DataFrame(data=data.values, index=dates,
                                        columns=['0_900_ENSO34']))
    df_ENSO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_ENSO
def _create_new_traintest_split(df_data, method='random9', seed=1,
                                kwrgs_events=None):
    """Replace the split info in `df_data` with a fresh train/test split.

    A dummy single-split level is prepended so the RV object can be built,
    the data is reduced to the original training samples, and a new CV split
    (generated inside that training set) is merged back in.

    Returns a DataFrame with a (split, time) MultiIndex carrying the new
    'TrainIsTrue' / 'RV_mask' columns.
    """
    # insert fake train test split to make RV
    stacked = pd.concat([df_data], axis=0, keys=[0])
    RV = df_data_to_RV(stacked, kwrgs_events=kwrgs_events)
    # Keep only the rows that were training samples, and strip the old split
    # bookkeeping columns.
    train_only = stacked.loc[0][stacked.loc[0]['TrainIsTrue'].values]
    train_only = train_only.drop(['TrainIsTrue', 'RV_mask'], axis=1)
    # create CV inside training set
    df_splits = functions_pp.rand_traintest_years(RV, method=method,
                                                  seed=seed,
                                                  kwrgs_events=kwrgs_events)
    # add Train test info
    splits = df_splits.index.levels[0]
    merged = [pd.merge(train_only, df_splits.loc[s],
                       left_index=True, right_index=True)
              for s in splits]
    return pd.concat(merged, keys=range(splits.size))
# NOTE(review): this fragment opens mid-scope — `key`, `expand` and `ex` are
# bound by enclosing code outside this chunk, and the original indentation of
# the first three statements could not be recovered; restore it from the full
# file before running.
key_exp = key + ' ' * expand
# Pad the key so the printed settings table lines up.
printline = '\'{}\'\t\t{}'.format(key_exp, ex[key])
print(printline)
printset()
#%%
# =============================================================================
# Run code with ex settings
# =============================================================================
#ex['lags'] = np.array([0]) ; ex['method'] = 'no_train_test_split' ;
# Timestamp tag for this run's output, e.g. '03-11-19_14hr'.
today = datetime.datetime.today().strftime("%d-%m-%y_%Hhr")
# Generate the train/test split for the response variable from the experiment
# settings gathered in `ex`.
df_splits = functions_pp.rand_traintest_years(RV, method=ex['method'],
                                              seed=ex['seed'],
                                              kwrgs_events=ex['kwrgs_events'])
#%%
# Settings forwarded to the CPPA precursor detection; the exact semantics of
# these thresholds live in func_CPPA — TODO confirm against that module.
kwrgs_CPPA = {'perc_yrs_out': ex['perc_yrs_out'],
              'days_before': ex['days_before'],
              'FCP_thres': ex['FCP_thres'],
              'SCM_percentile_thres': ex['SCM_percentile_thres']}
# Lags at which robust precursors are extracted (presumably days — verify).
lags_i = np.array([0, 10, 20, 50])
CPPA_prec = func_CPPA.get_robust_precursors(precur_arr, RV, df_splits,
                                            lags_i=lags_i,
                                            kwrgs_CPPA=kwrgs_CPPA)
def PDO(filename, ex, df_splits=None):
    #%%
    '''
    Compute the PDO pattern and index per train/test split, in parallel.

    PDO is calculated based upon all data points in the training years.
    Subsequently, the PDO pattern is projected on sst.sel(time=dates_train)
    to enable retrieving the PDO timeseries on a subset of the year; it is
    similarly projected on the dates_test.

    From https://climatedataguide.ucar.edu/climate-data/pacific-decadal-oscillation-pdo-definition-and-indices
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf

    Parameters
    ----------
    filename : path to the sst netcdf file.
    ex : experiment dict; 'tfreq' is read, and 'RV_name' when df_splits is None.
    df_splits : DataFrame with (split, time) MultiIndex and 'TrainIsTrue'
        column, or None to generate a split from `ex`.

    Returns
    -------
    (df_PDO, PDO_patterns) : per-split index timeseries and spatial patterns.
    '''
    t0 = time()
    if df_splits is None:
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)
    kwrgs_pp = {'selbox': {'la_min': 20,  # select domain in degrees east
                           'la_max': 65,
                           'lo_min': 115,
                           'lo_max': 250},
                'format_lon': 'only_east'}
    ds = core_pp.import_ds_lazy(filename, **kwrgs_pp)
    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = functions_pp.time_mean_bins(ds, ex, to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates
    dates = pd.to_datetime(ds.time.values)
    splits = df_splits.index.levels[0]
    data = np.zeros((splits.size, ds.latitude.size, ds.longitude.size))
    PDO_patterns = xr.DataArray(data,
                                coords=[splits, ds.latitude.values,
                                        ds.longitude.values],
                                dims=['split', 'latitude', 'longitude'])

    def PDO_single_split(s, ds, df_splits, PDO_patterns):
        # Worker: fit the PDO pattern on the training years of split s and
        # project it on the train and test dates of that split.
        progress = 100 * (s + 1) / splits.size
        dates_train = df_splits.loc[s]['TrainIsTrue'][
            df_splits.loc[s]['TrainIsTrue']].index
        train_yrs = np.unique(dates_train.year)
        # All dates falling in a training year (not only the RV period).
        dates_all_train = pd.to_datetime(
            [d for d in dates if d.year in train_yrs])
        dates_test = df_splits.loc[s]['TrainIsTrue'][
            ~df_splits.loc[s]['TrainIsTrue']].index
        n = dates_train.size
        r = int(100 * n / df_splits.loc[s].index.size)
        print(
            f"\rProgress PDO traintest set {progress}%, trainsize=({n}dp, {r}%)",
            end="")
        PDO_pattern, solver, adjust_sign = get_PDO(
            ds.sel(time=dates_all_train))
        # BUGFIX: project onto the freshly computed PDO_pattern. The original
        # projected onto PDO_patterns[s], which inside the worker process is
        # still the zero-initialized array (the parent only fills it in after
        # the futures complete) — cf. the sibling PDO_temp, which assigns the
        # pattern before projecting.
        data_train = find_precursors.calc_spatcov(ds.sel(time=dates_train),
                                                  PDO_pattern)
        data_test = find_precursors.calc_spatcov(ds.sel(time=dates_test),
                                                 PDO_pattern)
        df_test = pd.DataFrame(data=data_test.values, index=dates_test,
                               columns=['0_901_PDO'])
        df_train = pd.DataFrame(data=data_train.values, index=dates_train,
                                columns=['0_901_PDO'])
        df = pd.concat([df_test, df_train]).sort_index()
        return (df, PDO_pattern)

    # Context manager guarantees the workers are shut down, also on error.
    with ProcessPoolExecutor(os.cpu_count() - 1) as pool:  # amount of cores - 1
        futures = [pool.submit(PDO_single_split, s, ds, df_splits, PDO_patterns)
                   for s in splits]
        results = [future.result() for future in futures]
    list_splits = [r[0] for r in results]
    time_ = time() - t0
    print(time_ / 60)
    # Collect the per-split patterns computed in the workers.
    for s in splits:
        PDO_patterns[s] = results[s][1]
    df_PDO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_PDO, PDO_patterns
def PDO_temp(filename, ex, df_splits=None):
    #%%
    '''
    PDO is calculated based upon all data points in the training years,
    Subsequently, the PDO pattern is projection on the sst.sel(time=dates_train)
    to enable retrieving the PDO timeseries on a subset on the year.
    It is similarly also projected on the dates_test.
    From https://climatedataguide.ucar.edu/climate-data/pacific-decadal-oscillation-pdo-definition-and-indices

    Serial (non-parallel) variant: patterns and timeseries are computed per
    split in a plain loop. Returns only the per-split timeseries DataFrame.
    '''
    if df_splits is None:
        # No split given: derive one from the experiment settings.
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)
    kwrgs_pp = {'selbox': {'la_min': 20,  # select domain in degrees east
                           'la_max': 65,
                           'lo_min': 115,
                           'lo_max': 250},
                'format_lon': 'only_east'}
    ds = core_pp.import_ds_lazy(filename, **kwrgs_pp)
    to_freq = ex['tfreq']
    if to_freq != 1:
        # Aggregate to the experiment's time frequency before computing EOFs.
        ds, dates = functions_pp.time_mean_bins(ds, ex, to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates
    dates = pd.to_datetime(ds.time.values)
    splits = df_splits.index.levels[0]
    # One (lat, lon) pattern slot per split, filled in the loop below.
    data = np.zeros((splits.size, ds.latitude.size, ds.longitude.size))
    PDO_patterns = xr.DataArray(data,
                                coords=[splits, ds.latitude.values,
                                        ds.longitude.values],
                                dims=['split', 'latitude', 'longitude'])
    list_splits = []
    for s in splits:
        progress = 100 * (s + 1) / splits.size
        # Training dates of this split (rows where TrainIsTrue holds).
        dates_train = df_splits.loc[s]['TrainIsTrue'][
            df_splits.loc[s]['TrainIsTrue']].index
        train_yrs = np.unique(dates_train.year)
        # All dates that fall in a training year (not only the RV period).
        dates_all_train = pd.to_datetime(
            [d for d in dates if d.year in train_yrs])
        dates_test = df_splits.loc[s]['TrainIsTrue'][
            ~df_splits.loc[s]['TrainIsTrue']].index
        n = dates_train.size
        r = int(100 * n / df_splits.loc[s].index.size)
        print(
            f"\rProgress PDO traintest set {progress}%, trainsize=({n}dp, {r}%)",
            end="")
        # Fit the PDO pattern on the training years, then fill gaps along
        # longitude before projecting (in-place update of PDO_patterns[s]).
        PDO_patterns[s], solver, adjust_sign = get_PDO(
            ds.sel(time=dates_all_train))
        PDO_patterns[s] = PDO_patterns[s].interpolate_na(dim='longitude')
        # Project the pattern on train and test dates separately.
        data_train = find_precursors.calc_spatcov(ds.sel(time=dates_train),
                                                  PDO_patterns[s])
        data_test = find_precursors.calc_spatcov(ds.sel(time=dates_test),
                                                 PDO_patterns[s])
        df_test = pd.DataFrame(data=data_test.values, index=dates_test,
                               columns=['0_901_PDO'])
        df_train = pd.DataFrame(data=data_train.values, index=dates_train,
                                columns=['0_901_PDO'])
        # Recombine so each split has a full, chronologically sorted series.
        df = pd.concat([df_test, df_train]).sort_index()
        list_splits.append(df)
    df_PDO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_PDO
def calculate_corr_maps(ex, map_proj):
    """Compute correlation maps between the RV and each precursor field,
    cluster them into precursor regions, and (optionally) plot/save them.

    Parameters
    ----------
    ex : experiment dict holding the RV, variable list, method/split settings
        and plotting options; it is updated in place and returned.
    map_proj : cartopy-style projection forwarded to the plotting routine.

    Returns
    -------
    (ex, outdic_actors) : the updated experiment dict and a dict mapping
        variable name -> `act` instance (corr maps, grids, region labels).
    """
    #%%
    # =============================================================================
    # Load 'exp' dictionairy with information of pre-processed data
    # (variables, paths, filenames, etcetera..) and add RGCPD/Tigrimate
    # experiment settings
    # =============================================================================
    # Response Variable is what we want to predict
    RV = ex[ex['RV_name']]
    # time-cycle of data. total timesteps in one year:
    ex['time_cycle'] = RV.dates[RV.dates.year == RV.startyear].size
    ex['time_range_all'] = [0, RV.dates.size]
    #==================================================================================
    # Start of experiment
    #==================================================================================
    # Define traintest:
    df_RVfullts = pd.DataFrame(RV.RVfullts.values,
                               index=pd.to_datetime(RV.RVfullts.time.values))
    df_RV_ts = pd.DataFrame(RV.RV_ts.values,
                            index=pd.to_datetime(RV.RV_ts.time.values))
    if ex['method'][:9] == 'ran_strat':
        kwrgs_events = ex['kwrgs_events']
        RV = func_fc.RV_class(df_RVfullts, df_RV_ts, kwrgs_events)
    else:
        RV = func_fc.RV_class(df_RVfullts, df_RV_ts)
    if ex['import_prec_ts']:
        # Retrieve same train test split as imported ts
        path_data = ''.join(ex['precursor_ts'][0][1])
        df_splits = func_fc.load_hdf5(
            path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs = functions_pp.get_testyrs(df_splits)
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex, test_yrs)
        assert (np.equal(test_yrs, ex['tested_yrs'])).all(), \
            "Train test split not equal"
    else:
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)
    # =============================================================================
    # 2) DEFINE PRECURSOS COMMUNITIES:
    # =============================================================================
    # - calculate and plot pattern correltion for differnt fields
    # - create time-series over these regions
    #=====================================================================================
    outdic_actors = dict()

    class act:
        # Lightweight container for one precursor variable's correlation map
        # and grid metadata.
        def __init__(self, name, corr_xr, precur_arr):
            # BUGFIX: the original assigned `self.name = var`, silently
            # binding the enclosing loop variable instead of the parameter.
            self.name = name
            self.corr_xr = corr_xr
            self.precur_arr = precur_arr
            self.lat_grid = precur_arr.latitude.values
            self.lon_grid = precur_arr.longitude.values
            self.area_grid = rgcpd.get_area(precur_arr)
            self.grid_res = abs(self.lon_grid[1] - self.lon_grid[0])

    allvar = ex['vars'][0]  # list of all variable names
    for var in allvar[ex['excludeRV']:]:  # loop over all variables
        actor = ex[var]
        #===========================================
        # 3c) Precursor field
        #===========================================
        file_path = os.path.join(actor.path_pp, actor.filename_pp)
        precur_arr = functions_pp.import_ds_timemeanbins(file_path, ex)
        # precur_arr = rgcpd.convert_longitude(precur_arr, 'only_east')
        # =============================================================================
        # Calculate correlation
        # =============================================================================
        corr_xr = rgcpd.calc_corr_coeffs_new(precur_arr, RV, ex)
        # =============================================================================
        # Cluster into precursor regions
        # =============================================================================
        actor = act(var, corr_xr, precur_arr)
        actor, ex = rgcpd.cluster_DBSCAN_regions(actor, ex)
        # Only plot region labels if at least one region was found.
        if not np.isnan(actor.prec_labels.values).all():
            rgcpd.plot_regs_xarray(actor.prec_labels.copy(), ex)
        outdic_actors[var] = actor
        # =============================================================================
        # Plot
        # =============================================================================
        if not ex['plotin1fig']:
            plot_maps.plot_corr_maps(corr_xr, corr_xr['mask'], map_proj)
            fig_filename = '{}_corr_{}_vs_{}'.format(
                ex['params'], ex['RV_name'], var) + ex['file_type2']
            plt.savefig(os.path.join(ex['fig_path'], fig_filename),
                        bbox_inches='tight', dpi=200)
            if not ex['showplot']:
                plt.close()
    #%%
    return ex, outdic_actors