def load_files(self, path_hashfile: str = None, hash_str: str = None):
    #%%
    if hash_str is None:
        hash_str = '{}_a{}_{}_{}'.format(self._name, self.alpha,
                                         self.distance_eps,
                                         self.min_area_in_degrees2)
    if path_hashfile is None:
        path_hashfile = functions_pp.get_download_path()
    f_name = None
    for root, dirs, files in os.walk(path_hashfile):
        for file in files:
            if re.findall(f'{hash_str}', file):
                print(f'Found file {file}')
                f_name = file
    if f_name is not None:
        filepath = os.path.join(path_hashfile, f_name)
        self.ds = core_pp.import_ds_lazy(filepath)
        self.corr_xr = self.ds['corr_xr']
        self.alpha = self.corr_xr.attrs['alpha']
        self.FDR_control = bool(self.corr_xr.attrs['FDR_control'])
        self.precur_arr = self.ds['precur_arr']
        # self._tfreq = self.precur_arr.attrs['_tfreq']
        if 'prec_labels' in self.ds.variables.keys():
            self.prec_labels = self.ds['prec_labels']
            self.distance_eps = self.prec_labels.attrs['distance_eps']
            self.min_area_in_degrees2 = self.prec_labels.attrs[
                'min_area_in_degrees2']
            self.group_lag = bool(self.prec_labels.attrs['group_lag'])
            self.group_split = bool(self.prec_labels.attrs['group_split'])
        loaded = True
    else:
        print('No file that matches the hash_str or instance settings in '
              f'folder {path_hashfile}')
        loaded = False
    return loaded
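# Hedged usage sketch (added, not part of the original source): load_files()
# restores a previously stored precursor Dataset by matching hash_str against
# filenames in path_hashfile. `precur` stands for any precursor instance
# exposing this method; the folder path is a hypothetical placeholder.
loaded = precur.load_files(path_hashfile='/path/to/stored_files',
                           hash_str='a0.05')
if not loaded:
    print('no stored file found; compute and store the precursor first')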
def ds_oos_lindetrend(dsclust, df_splits, path):
    kwrgs_NaN_handling = {'missing_data_ts_to_nan': False,
                          'extra_NaN_limit': False,
                          'inter_method': False,
                          'final_NaN_to_clim': False}
    years = list(range(1950, 2020))
    selbox = [253, 290, 28, 52]
    # note: raw_filename and target_dataset are not parameters; they are
    # assumed to be defined at module level
    ds_raw = core_pp.import_ds_lazy(raw_filename, var='variable',
                                    selbox=selbox,
                                    kwrgs_NaN_handling=kwrgs_NaN_handling).rename({'z': 'time'})
    ds_raw.name = 'Soy_Yield'
    ds_raw['time'] = pd.to_datetime([f'{y+1949}-01-01'
                                     for y in ds_raw.time.values])
    ds_raw = ds_raw.sel(time=core_pp.get_oneyr(ds_raw, *years))
    label = int(target_dataset.split('__')[-1])
    clusmask = dsclust['xrclustered'] == label
    ds_raw = ds_raw.where(clusmask)
    ds_out = utils_paper3.detrend_oos_3d(ds_raw, min_length=30,
                                         df_splits=df_splits,
                                         standardize=True, path=path)
    return ds_out
def spatial_mean_clusters(var_filename, xrclust, kwrgs_load: dict = {}):
    #%%
    if type(var_filename) is str:
        xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_load)
    elif type(var_filename) is xr.DataArray:
        xarray = var_filename
    else:
        raise TypeError('Give var_filename as str or xr.DataArray')
    labels = xrclust.values
    nparray = xarray.values
    track_names = []
    area_grid = find_precursors.get_area(xarray)
    regions_for_ts = list(np.unique(labels[~np.isnan(labels)]))
    a_wghts = area_grid / area_grid.mean()

    # this array will be the time series for each feature
    ts_clusters = np.zeros((xarray.shape[0], len(regions_for_ts)))

    # calculate area-weighted mean over labels
    for r in regions_for_ts:
        track_names.append(int(r))
        idx = regions_for_ts.index(r)
        # start with empty lonlat array
        B = np.zeros(xrclust.shape)
        # Mask everything except region of interest
        B[labels == r] = 1
        # Calculates how values inside region vary over time
        ts_clusters[:, idx] = np.nanmean(nparray[:, B == 1] * a_wghts[B == 1],
                                         axis=1)
    xrts = xr.DataArray(ts_clusters.T,
                        coords={'cluster': track_names,
                                'time': xarray.time},
                        dims=['cluster', 'time'])
    # extract selected setting for ts
    dims = list(xrclust.coords.keys())
    standard_dim = ['latitude', 'longitude', 'time', 'mask', 'cluster']
    dims = [d for d in dims if d not in standard_dim]
    if 'n_clusters' in dims:
        idx = dims.index('n_clusters')
        dims[idx] = 'ncl'
        xrclust = xrclust.rename({'n_clusters': dims[idx]}).copy()
    var1 = str(xrclust[dims[0]])
    dim1 = dims[0]
    xrts.attrs[dim1] = var1
    xrclust.attrs[dim1] = var1
    xrclust = xrclust.drop(dim1)
    if len(dims) == 2:
        var2 = int(xrclust[dims[1]])
        dim2 = dims[1]
        xrts.attrs[dim2] = var2
        xrclust.attrs[dim2] = var2
        xrclust = xrclust.drop(dim2)
    ds = xr.Dataset({'xrclustered': xrclust, 'ts': xrts})
    #%%
    return ds
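# Hedged usage sketch (added): compute an area-weighted mean time series per
# cluster label. The NetCDF path is a hypothetical placeholder; `xrclustered`
# is e.g. the DataArray returned by the clustering functions in this module.
ds_ts = spatial_mean_clusters('/path/to/t2m_preprocessed.nc', xrclustered)
print(ds_ts['ts'].sel(cluster=1))  # time series of cluster label 1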
def get_spatial_ma(var_filename, mask=None, kwrgs_l_spatial: dict = {}):
    '''
    var_filename must be a 3d netcdf file with only one variable.

    mask can be a nc file containing only a mask, or a latlon box in the
    format [west_lon, east_lon, south_lat, north_lat] in common west-east
    degrees.

    Built upon sklearn clustering. Techniques available are listed in
    sklearn.cluster.__dict__, e.g. KMeans or AgglomerativeClustering;
    kwrgs are technique dependent, see the sklearn docs.
    '''
    if mask is None:
        xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
        lons = xarray.longitude.values
        lats = xarray.latitude.values
        mask = [min(lons), max(lons), min(lats), max(lats)]
        print(f'no mask given, entire array of box {mask} will be clustered')
    if type(mask) is str:
        xrmask = core_pp.import_ds_lazy(mask, **kwrgs_l_spatial)
        if xrmask.attrs['is_DataArray'] == False:
            variables = list(xrmask.variables.keys())
            strvars = [' {} '.format(var) for var in variables]
            common_fields = ' time time_bnds longitude latitude lev lon lat level '
            var = [var for var in strvars if var not in common_fields]
            if len(var) != 0:
                var = var[0].replace(' ', '')
                npmask = xrmask[var].values
        else:
            npmask = xrmask.values
    elif type(mask) is list or type(mask) is tuple:
        xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
        selregion = core_pp.import_ds_lazy(var_filename, selbox=mask)
        lons_mask = list(selregion.longitude.values)
        lon_mask = [True if l in lons_mask else False
                    for l in xarray.longitude]
        lats_mask = list(selregion.latitude.values)
        lat_mask = [True if l in lats_mask else False
                    for l in xarray.latitude]
        npmasklon = np.meshgrid(lon_mask, lat_mask)[0]
        npmasklat = np.meshgrid(lon_mask, lat_mask)[1]
        npmask = np.logical_and(npmasklon, npmasklat)
    elif type(mask) is type(xr.DataArray([0])):
        # lo_min = float(mask.longitude.min()); lo_max = float(mask.longitude.max())
        # la_min = float(mask.latitude.min()); la_max = float(mask.latitude.max())
        # selbox = (lo_min, lo_max, la_min, la_max)
        # selregion = core_pp.import_ds_lazy(var_filename, selbox=selbox)
        # selregion = selregion.where(mask)
        npmask = mask.values
    return npmask
def get_spatial_ma(var_filename: str = None, mask=None,
                   kwrgs_l_spatial: dict = {}):
    '''
    var_filename must be a 3d netcdf file with only one variable.

    mask can be a nc file containing only a mask, or a latlon box in the
    format [west_lon, east_lon, south_lat, north_lat] in common west-east
    degrees.
    '''
    if mask is None:
        xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
        lons = xarray.longitude.values
        lats = xarray.latitude.values
        mask = [min(lons), max(lons), min(lats), max(lats)]
        print(f'Loaded array with coordinates {mask}')
    if type(mask) is str:
        xrmask = core_pp.import_ds_lazy(mask, **kwrgs_l_spatial)
        if xrmask.attrs['is_DataArray'] == False:
            variables = list(xrmask.variables.keys())
            strvars = [' {} '.format(var) for var in variables]
            common_fields = ' time time_bnds longitude latitude lev lon lat level '
            var = [var for var in strvars if var not in common_fields]
            if len(var) != 0:
                var = var[0].replace(' ', '')
                npmask = xrmask[var].values
        else:
            npmask = xrmask.values
    # creates a subdomain within a larger domain
    elif type(mask) is list or type(mask) is tuple:
        xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
        selregion = core_pp.import_ds_lazy(var_filename, selbox=mask)
        lons_mask = list(selregion.longitude.values)
        lon_mask = [True if l in lons_mask else False
                    for l in xarray.longitude]
        lats_mask = list(selregion.latitude.values)
        lat_mask = [True if l in lats_mask else False
                    for l in xarray.latitude]
        npmasklon = np.meshgrid(lon_mask, lat_mask)[0]
        npmasklat = np.meshgrid(lon_mask, lat_mask)[1]
        npmask = np.logical_and(npmasklon, npmasklat)
    elif type(mask) is type(xr.DataArray([0])):
        # lo_min = float(mask.longitude.min()); lo_max = float(mask.longitude.max())
        # la_min = float(mask.latitude.min()); la_max = float(mask.latitude.max())
        # selbox = (lo_min, lo_max, la_min, la_max)
        # selregion = core_pp.import_ds_lazy(var_filename, selbox=selbox)
        # selregion = selregion.where(mask)
        npmask = mask.values
    return npmask
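# Hedged usage sketch (added): the three supported mask types, straight from
# the docstring above. Paths are hypothetical placeholders.
npmask = get_spatial_ma('/path/to/field.nc')  # no mask: full domain
npmask = get_spatial_ma('/path/to/field.nc', mask=[230, 300, 25, 50])  # latlon box
npmask = get_spatial_ma('/path/to/field.nc', mask='/path/to/mask.nc')  # mask file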
def update_dates(cls, ex):
    import os
    file_path = os.path.join(cls.path_pp, cls.filename_pp)
    kwrgs_pp = {'selbox': ex['selbox'],
                'loadleap': False}
    ds = core_pp.import_ds_lazy(file_path, **kwrgs_pp)
    temporal_freq = pd.Timedelta((ds['time'][1] - ds['time'][0]).values)
    cls.dates = pd.to_datetime(ds['time'].values)
    cls.temporal_freq = '{}days'.format(temporal_freq.days)
    return cls, ex
def ENSO_34(file_path, ex, df_splits=None):
    #%%
    # file_path = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/sst_1979-2018_1_12_daily_2.5deg.nc'
    '''
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf
    '''
    if df_splits is None:
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)
        seldates = None
    else:
        seldates = df_splits.loc[0].index
    kwrgs_pp = {'selbox': {'la_min': -5,  # select domain in degrees east
                           'la_max': 5,
                           'lo_min': -170,
                           'lo_max': -120},
                'seldates': seldates}
    ds = core_pp.import_ds_lazy(file_path, **kwrgs_pp)
    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = functions_pp.time_mean_bins(ds, ex, to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates
    dates = pd.to_datetime(ds.time.values)
    splits = df_splits.index.levels[0]
    list_splits = []
    for s in splits:
        progress = 100 * (s + 1) / splits.size
        print(f"\rProgress ENSO traintest set {progress}%", end="")
        data = functions_pp.area_weighted(ds).mean(dim=('latitude',
                                                        'longitude'))
        list_splits.append(pd.DataFrame(data=data.values,
                                        index=dates,
                                        columns=['0_900_ENSO34']))
    df_ENSO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_ENSO
def load_precursor(ex):
    #%%
    dates_all = ex['dates_all']
    # =========================================================================
    # Load Precursor
    # =========================================================================
    prec_filename = os.path.join(ex['path_pp'], ex['filename_precur'])
#    if ex['datafolder'] == 'EC':
#        try:
#            datesRV = func_CPPA.make_datestr(dates_all, ex,
#                                             ex['startyear'], ex['endyear'], lpyr=False)
#            dates_prec = subset_dates(datesRV, ex)
##            varfullgl = func_CPPA.import_ds_lazy(prec_filename, ex, seldates=dates_prec)
#        except:
#            datesRV = func_CPPA.make_datestr(dates_all, ex,
#                                             ex['startyear'], ex['endyear'], lpyr=True)
#            dates_prec = subset_dates(datesRV, ex)
#            varfullgl = func_CPPA.import_ds_lazy(prec_filename, ex, seldates=dates_prec)
#    else:
    Prec_reg = functions_pp.import_ds_timemeanbins(prec_filename, ex['tfreq'],
                                                   loadleap=True,
                                                   to_xarr=False,
                                                   seldates=ex['dates_all'])
    Prec_reg = core_pp.convert_longitude(Prec_reg, 'only_east')
    if ex['add_lsm']:
        kwrgs_2d = {'selbox': ex['selbox'], 'format_lon': 'only_east'}
        lsm_filename = os.path.join(ex['path_mask'], ex['mask_file'])
        lsm = core_pp.import_ds_lazy(lsm_filename, **kwrgs_2d)
        Prec_reg['lsm'] = (('latitude', 'longitude'), (lsm < 0.3).values)
        Prec_reg = Prec_reg.where(Prec_reg['lsm'])
    if 'exclude_yrs' in ex.keys():
        if len(ex['exclude_yrs']) != 0:
            print('excluding yr(s): {} from analysis'.format(
                ex['exclude_yrs']))
            all_yrs = np.unique(dates_all.year)
            yrs_keep = [y for y in all_yrs if y not in ex['exclude_yrs']]
            idx_yrs = [i for i in np.arange(dates_all.year.size)
                       if dates_all.year[i] in yrs_keep]
            # dates_all = dates_all[idx_yrs]
            mask_all = np.zeros(dates_all.size, dtype=bool)
            mask_all[idx_yrs] = True
            dates_excl_yrs = dates_all[mask_all]
            Prec_reg = Prec_reg.sel(time=dates_excl_yrs)
    #%%
    return Prec_reg, ex
def check(rg, list_of_name_path, cluster_nr):
    import matplotlib.pyplot as plt
    import core_pp
    t2m_path = list_of_name_path[0][1]
    t2m = core_pp.import_ds_lazy(t2m_path, format_lon='west_east')
    t2m_clus = t2m.sel(cluster=cluster_nr)
    sst_path = list_of_name_path[1][1]
    sst = core_pp.import_ds_lazy(sst_path, format_lon='west_east')
    swvl12_path = list_of_name_path[2][1]
    swvl12 = core_pp.import_ds_lazy(swvl12_path, format_lon='west_east')

    # example time series plot for first cluster
    plt.figure()
    t2m_clus.ts.plot()
    # check plot for sst
    plt.figure()
    sst[0].plot()
    # check plot for swvl
    plt.figure()
    swvl12[0].plot()

    # Check plot of clusters
    # if TVpath contains the xr.DataArray that was clustered beforehand,
    # we can have a look at the spatial regions.
    ds = rg.get_clust(format_lon='west_east')
    fig = plot_maps.plot_labels(ds['xrclustered'],
                                kwrgs_plot={'col_dim': 'n_clusters',
                                            'title': 'Hierarchical Clustering',
                                            'cbar_tick_dict': {'labelsize': 8},
                                            'add_cfeature': 'BORDERS'})
def dendogram_clustering(var_filename, mask=None, q=70,
                         clustermethodkey='AgglomerativeClustering',
                         kwrgs={'n_clusters': 3}):
    xarray = core_pp.import_ds_lazy(var_filename)
    npmask = get_spatial_ma(var_filename, mask)
    xarray = binary_occurences_quantile(xarray, q=q)
    xrclustered, results = skclustering(xarray, npmask,
                                        clustermethodkey=clustermethodkey,
                                        kwrgs=kwrgs)
    return xrclustered, results
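# Hedged usage sketch (added): cluster binary exceedances of the 70th
# percentile into 3 regions. The path is a hypothetical placeholder.
xrclustered, results = dendogram_clustering('/path/to/t2m_preprocessed.nc',
                                            mask=[230, 300, 25, 50],
                                            q=70, kwrgs={'n_clusters': 3})
plot_maps.plot_labels(xrclustered)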
def check_pp_done(cls, ex):
    #%%
    '''
    Check if pre-processed ncdf already exists
    '''
    # =========================================================================
    # load dataset lazy
    # =========================================================================
    import pandas as pd
    filename = os.path.join(ex['path_raw'], cls.filename)
    kwrgs_pp = {'selbox': ex['selbox'], 'loadleap': False}
    ds = core_pp.import_ds_lazy(filename, **kwrgs_pp)
    dates = pd.to_datetime(ds['time'].values)
    # =========================================================================
    # get time series that you request
    # =========================================================================
    # dates = timeseries_tofit_bins(ds, ex, seldays='part')[1]
    start_day = get_oneyr(dates)[0]
    end_day = get_oneyr(dates)[-1]
    # =========================================================================
    # give appropriate name to output file
    # =========================================================================
    outfilename = cls.filename[:-3] + '.nc'
    # outfilename = outfilename.replace('daily', 'dt-{}days'.format(1))
    months = dict({1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may',
                   6: 'jun', 7: 'jul', 8: 'aug', 9: 'sep', 10: 'okt',
                   11: 'nov', 12: 'dec'})
    if ex['input_freq'] == 'daily':
        startdatestr = '_{}{}_'.format(start_day.day, months[start_day.month])
        enddatestr = '_{}{}_'.format(end_day.day, months[end_day.month])
    elif ex['input_freq'] == 'monthly':
        startdatestr = '_{}_'.format(months[start_day.month])
        enddatestr = '_{}_'.format(months[end_day.month])
    outfilename = outfilename.replace('_{}_'.format(1), startdatestr)
    outfilename = outfilename.replace('_{}_'.format(12), enddatestr)
    cls.filename_pp = outfilename
    cls.path_pp = ex['path_pp']
    outfile = os.path.join(ex['path_pp'], outfilename)
    cls.dates_fit_tfreq = dates
    print('output file of pp will be saved as: \n' + outfile)
    #%%
    return outfile, cls, ex
def regrid_array(xr_or_filestr, to_grid, periodic=False):
    import functions_pp
    if type(xr_or_filestr) == str:
        xarray = core_pp.import_ds_lazy(xr_or_filestr)
        plot_maps.plot_corr_maps(xarray[0])
        xr_regrid = functions_pp.regrid_xarray(xarray, to_grid,
                                               periodic=periodic)
        plot_maps.plot_corr_maps(xr_regrid[0])
    else:
        plot_maps.plot_labels(xr_or_filestr)
        xr_regrid = functions_pp.regrid_xarray(xr_or_filestr, to_grid,
                                               periodic=periodic)
        plot_maps.plot_labels(xr_regrid)
        plot_maps.plot_labels(xr_regrid.where(xr_regrid.values == 3))
    return xr_regrid
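# Hedged usage sketch (added): regrid a field to a coarser grid, assuming
# to_grid is the target resolution in degrees (see functions_pp.regrid_xarray);
# periodic=True treats longitude as cyclic. The path is a placeholder.
xr_regrid = regrid_array('/path/to/sst_2.5deg.nc', to_grid=1.0, periodic=True)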
def import_ds_timemeanbins(file_path, ex, loadleap=False, to_xarr=True,
                           seldates=None):
    kwrgs_pp = {'selbox': ex['selbox'],
                'loadleap': loadleap,
                'seldates': seldates}
    ds = core_pp.import_ds_lazy(file_path, **kwrgs_pp)
    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = time_mean_bins(ds, ex, to_freq=to_freq, seldays='part')
        ds['time'] = dates
        # print('temporal frequency \'dt\' is: \n{}'.format(dates[1] - dates[0]))
    if to_xarr:
        if type(ds) == type(xr.DataArray(data=[0])):
            ds = ds.squeeze()
        else:
            ds = ds.to_array().squeeze()
    return ds
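# Hedged usage sketch (added): load a field and aggregate it to the temporal
# frequency given by ex['tfreq']. The path is a hypothetical placeholder, and
# ex may need further keys depending on what time_mean_bins expects.
ex = {'selbox': [230, 300, 25, 50], 'tfreq': 15}
ds_binned = import_ds_timemeanbins('/path/to/field.nc', ex)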
# ## In[ ]:
# #rg.list_precur_pp
var_filename = '/Users/semvijverberg/surfdrive/Data_EC/input_pp/tas_2000-2159_1jan_31dec_daily_1.125deg.nc'
LSM = '/Users/semvijverberg/surfdrive/Data_EC/input_raw/mask_North_America_1.125deg.nc'
#%%
import make_country_mask

selbox = (225, 300, 20, 70)
xarray, Country = make_country_mask.create_mask(var_filename,
                                                kwrgs_load={'selbox': selbox},
                                                level='Countries')
mask_US = xarray.values == Country.US
lsm = core_pp.import_ds_lazy(LSM, selbox=selbox)
mask_US = np.logical_and(mask_US, (lsm > .3).values)
xr_mask = xarray.where(mask_US)
xr_mask.values[mask_US] = 1
xr_mask = xrmask_by_latlon(xr_mask, lonmin=237)
xr_mask = xrmask_by_latlon(xr_mask, lonmin=238, latmin=39)
xr_mask = xrmask_by_latlon(xr_mask, lonmin=239, latmin=38)
xr_mask = xrmask_by_latlon(xr_mask, lonmin=240, latmin=36)
plot_maps.plot_labels(xr_mask)

# In[9]:
# =============================================================================
# Clustering co-occurence of anomalies
# =============================================================================
q = [80, 85, 90, 95]
n_clusters = [2, 3, 4, 5, 6, 7, 8]
#%%
import make_country_mask
# xarray, Country = make_country_mask.create_mask(var_filename, kwrgs_load={'selbox':selbox}, level='Countries')
# mask_US = (xarray.values == Country.US)
# mask_US = make_country_mask.binary_erosion(mask_US)
# mask_US = make_country_mask.binary_erosion(mask_US)
# mask_US = make_country_mask.binary_opening(mask_US)
# xr_mask = xarray.where(mask_US)
# xr_mask.values[mask_US] = 1
# xr_mask = cl.mask_latlon(xr_mask, latmax=63, lonmax=270)
selbox = (232, 295, 25, 50)
xr_mask = core_pp.import_ds_lazy(
    user_dir + '/surfdrive/Scripts/rasterio/mask_North_America_0.25deg_orig.nc',
    var='lsm', selbox=selbox)
xr_mask.values = make_country_mask.binary_erosion(xr_mask.values)
plot_maps.plot_labels(xr_mask)

# In[9]:
# =============================================================================
# Clustering co-occurence of anomalies
# =============================================================================
q = [80, 85, 90, 95]
n_clusters = [2, 3, 4, 5, 6, 7, 8]
tfreq = 1
from time import time
t0 = time()
xrclustered, results = cl.dendogram_clustering(var_filename,
def plot_ss2(agg_level, skillscores, col_wrap, metric=None):
    #%%
    import find_precursors
    cluster_nc_path = get_list_of_name_path(agg_level, 1)[0][1]
    ds = core_pp.import_ds_lazy(cluster_nc_path, format_lon='west_east')
    cluster_labels_org = ds.coords['cluster']
    ds = ds['xrclustered']

    # create list of skill score names
    skillscores_multi_idx = skillscores.index.levels
    ss_list = []
    for i in skillscores_multi_idx[1:][0]:
        for j in skillscores_multi_idx[1:][1]:
            ss_name = '{}_{}'.format(i, j)
            ss_list.append(ss_name)
    if metric is not None:  # only apply single metric
        ss_list = [metric]

    # add dimensions and coordinates
    xr_score = ds.copy()
    xr_score.attrs = {}
    list_xr = [xr_score.copy().expand_dims('metric', axis=0)
               for m in ss_list]
    xr_score = xr.concat(list_xr, dim='metric')
    xr_score['metric'] = ('metric', ss_list)
    list_xr = [xr_score.copy().expand_dims('target_month', axis=0)
               for m in skillscores.columns]
    xr_score = xr.concat(list_xr, dim='target_month')
    xr_score['target_month'] = ('target_month', skillscores.columns)

    # replace labels with skillscores
    for metric_nr, metric in enumerate(xr_score.metric.values):
        test_or_train = metric[:metric.find("_")]
        ss = metric[metric.find("_") + 1:]
        for month_nr, month in enumerate(xr_score.target_month.values):
            # slice over metric, month in skill score df
            metric_cluster_dict = skillscores[month].xs(
                (test_or_train, ss), level=(1, 2)).to_dict()
            # replace cluster_labels with their skill score
            cluster_labels_new = [metric_cluster_dict.get(x, x)
                                  for x in cluster_labels_org.values]
            # set all non-replaced values of cluster labels to np.nan
            cluster_labels_new = [np.nan if isinstance(x, np.int32) else x
                                  for x in cluster_labels_new]
            # replace values
            xarr_labels_to_replace = ds
            xr_score[month_nr, metric_nr] = find_precursors.view_or_replace_labels(
                xarr_labels_to_replace,
                regions=list(cluster_labels_org.values),
                replacement_labels=cluster_labels_new)

    # set subtitles; col_wrap (int) determines nr of cols
    import math
    subtitles = [[] for i in range(
        int(math.ceil(xr_score.target_month.values.size / col_wrap)))]
    total_nr_fields = col_wrap * len(subtitles)
    j = -1
    for i, month in enumerate(xr_score.target_month.values):
        if i % col_wrap == 0:
            j += 1
        subtitles[j].append('{}, {}'.format(month, metric))
        if i == max(list(enumerate(xr_score.target_month.values)))[0] \
                and total_nr_fields > xr_score.target_month.values.size:
            for k in range(total_nr_fields
                           - xr_score.target_month.values.size):
                subtitles[j].append('0')

    # plot
    fig = plot_maps.plot_corr_maps(xr_score,
                                   col_dim='target_month',
                                   row_dim='metric',
                                   size=4,
                                   clevels=np.arange(-.5, 0.51, .1),
                                   cbar_vert=-0.1,
                                   hspace=-0.2,
                                   subtitles=subtitles,
                                   col_wrap=col_wrap)
    #%%
    return fig
def sklearn_clustering(var_filename, mask=None, kwrgs_load={},
                       clustermethodkey='DBSCAN',
                       kwrgs_clust={'eps': 600}):
    if 'selbox' in kwrgs_load.keys():
        if kwrgs_load['selbox'] is not None:
            mask = kwrgs_load.pop('selbox')
            print('mask overwritten with selbox list. Both selbox and mask '
                  'are given. Both adapt the domain over which to cluster')
    kwrgs_l_spatial = {}  # kwrgs affecting spatial extent/format
    if 'format_lon' in kwrgs_load.keys():
        kwrgs_l_spatial['format_lon'] = kwrgs_load['format_lon']
    xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
    npmask = get_spatial_ma(var_filename, mask,
                            kwrgs_l_spatial=kwrgs_l_spatial)
    kwrgs_loop = {k: i for k, i in kwrgs_clust.items() if type(i) == list}
    [kwrgs_loop.update({k: i}) for k, i in kwrgs_load.items()
     if type(i) == list]
    if len(kwrgs_loop) == 1:  # insert fake axes
        kwrgs_loop['fake'] = [0]
    if len(kwrgs_loop) >= 1:
        new_coords = []
        xrclustered = xarray[0].drop('time')
        for k, list_v in kwrgs_loop.items():  # in alphabetical order
            new_coords.append(k)
            dim_coords = {str(k): list_v}
            xrclustered = xrclustered.expand_dims(dim_coords).copy()
        new_coords = [d for d in xrclustered.dims
                      if d not in ['latitude', 'longitude']]
        results = []
        first_loop = kwrgs_loop[new_coords[0]]
        second_loop = kwrgs_loop[new_coords[1]]
        for i, v1 in enumerate(first_loop):
            for j, v2 in enumerate(second_loop):
                kwrgs = adjust_kwrgs(kwrgs_clust.copy(), new_coords, v1, v2)
                kwrgs_l = adjust_kwrgs(kwrgs_load.copy(), new_coords, v1, v2)
                print(f"\rclustering {new_coords[0]}: {v1}, "
                      f"{new_coords[1]}: {v2} ", end="")
                xarray = functions_pp.import_ds_timemeanbins(var_filename,
                                                             **kwrgs_l)
                xrclustered[i, j], result = skclustering(
                    xarray, npmask,
                    clustermethodkey=clustermethodkey,
                    kwrgs=kwrgs)
                results.append(result)
        if 'fake' in new_coords:
            xrclustered = xrclustered.squeeze().drop('fake').copy()
    else:
        xrclustered, results = skclustering(xarray, npmask,
                                            clustermethodkey=clustermethodkey,
                                            kwrgs=kwrgs_clust)
    xrclustered.attrs['method'] = clustermethodkey
    xrclustered.attrs['kwrgs'] = str(kwrgs_clust)
    xrclustered.attrs['target'] = f'{xarray.name}'
    if 'hash' not in xrclustered.attrs.keys():
        xrclustered.attrs['hash'] = uuid.uuid4().hex[:5]
    return xrclustered, results
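# Hedged usage sketch (added): DBSCAN with a grid over eps; list-valued
# entries in kwrgs_clust become dimensions of the returned DataArray.
# The path is a hypothetical placeholder.
xrclustered, results = sklearn_clustering('/path/to/field.nc',
                                          mask=[230, 300, 25, 50],
                                          clustermethodkey='DBSCAN',
                                          kwrgs_clust={'eps': [400, 600, 800]})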
rg = RGCPD(list_of_name_path=list_of_name_path,
           list_for_MI=list_for_MI,
           list_import_ts=None,
           start_end_TVdate=start_end_TVdate,
           start_end_date=start_end_date,
           start_end_year=start_end_year,
           path_outmain=path_out_main,
           append_pathsub=append_main)
rg.pp_precursors()
rg.pp_TV(anomaly=False, detrend=True)
rg.traintest(method)
ds = core_pp.import_ds_lazy(rg.list_precur_pp[0][1])
season = ds.resample(time='QS-DEC').mean()
#%% on post-processed (anomaly, detrended) SST
import climate_indices

df_ENSO, ENSO_yrs, df_states = climate_indices.ENSO_34(rg.list_precur_pp[0][1],
                                                       rg.df_splits.copy(),
                                                       get_ENSO_states=True)
cycle = df_states[['EN_cycle']].loc[0]
print('El Nino yrs', list(cycle[cycle == 'EN0'].dropna().index.year))
cycle = df_states[['LN_cycle']].loc[0]
print('La Nina yrs', list(cycle[cycle == 'LN0'].dropna().index.year))
#%% Composites of Anderson 2017 ENSO states
for title in ['EN-1', 'EN0', 'EN+1', 'LN-1', 'LN0', 'LN+1']:
def dendogram_clustering(var_filename: str = None, mask=None, kwrgs_load={},
                         clustermethodkey='AgglomerativeClustering',
                         kwrgs_clust={'q': 70, 'n_clusters': 3},
                         n_cpu=None):
    '''
    Parameters
    ----------
    var_filename : str
        path to pre-processed Netcdf file.
    mask : [xr.DataArray, path to netcdf file with mask, list or tuple], optional
        See get_spatial_ma?. The default is None.
    kwrgs_load : dict, optional
        See functions_pp.import_ds_timemeanbins? for parameters.
        The default is {}.
    clustermethodkey : str, optional
        See sklearn.cluster.__dict__ for all sklearn cluster algorithms.
        The default is 'AgglomerativeClustering'.
    kwrgs_clust : dict, optional
        Note that q is in percentiles, i.e. 50 refers to the median.
        The default is {'q': 70, 'n_clusters': 3}.

    Returns
    -------
    xrclustered : xr.DataArray
    results : list of sklearn cluster method instances.
    '''
    if n_cpu is None:
        n_cpu = multiprocessing.cpu_count() - 1
    if 'selbox' in kwrgs_load.keys():
        if kwrgs_load['selbox'] is not None:
            mask = kwrgs_load.pop('selbox')
            print('mask overwritten because both selbox and mask are given. '
                  'Both adapt the domain over which to cluster')
    kwrgs_l_spatial = {}  # kwrgs affecting spatial extent/format
    if 'format_lon' in kwrgs_load.keys():
        kwrgs_l_spatial['format_lon'] = kwrgs_load['format_lon']
    xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
    npmask = get_spatial_ma(var_filename, mask,
                            kwrgs_l_spatial=kwrgs_l_spatial)
    kwrgs_loop = {k: i for k, i in kwrgs_clust.items() if type(i) == list}
    kwrgs_loop_load = {k: i for k, i in kwrgs_load.items()
                       if type(i) == list}
    [kwrgs_loop.update({k: i}) for k, i in kwrgs_loop_load.items()]
    q = kwrgs_clust['q']
    if len(kwrgs_loop) == 1:  # insert fake axes
        kwrgs_loop['fake'] = [0]
    if len(kwrgs_loop) >= 1:
        new_coords = []
        xrclustered = xarray[0].drop('time')
        for k, list_v in kwrgs_loop.items():  # in alphabetical order
            new_coords.append(k)
            dim_coords = {str(k): list_v}
            xrclustered = xrclustered.expand_dims(dim_coords).copy()
        new_coords = [d for d in xrclustered.dims
                      if d not in ['latitude', 'longitude']]
        results = []
        first_loop = kwrgs_loop[new_coords[0]]
        second_loop = kwrgs_loop[new_coords[1]]
        comb = [[v1, v2] for v1, v2 in product(first_loop, second_loop)]

        def generator(var_filename, xarray, comb, new_coords, kwrgs_clust,
                      kwrgs_load, q):
            for v1, v2 in comb:
                kwrgs = adjust_kwrgs(kwrgs_clust.copy(), new_coords, v1, v2)
                kwrgs_l = adjust_kwrgs(kwrgs_load.copy(), new_coords, v1, v2)
                del kwrgs['q']
                print(f"clustering {new_coords[0]}: {v1}, "
                      f"{new_coords[1]}: {v2}")
                yield kwrgs, kwrgs_l, v1, v2

        def execute_to_dict(var_filename, npmask, v1, v2, q,
                            clustermethodkey, kwrgs, kwrgs_l):
            # if reload:  # some param has been adjusted
            xarray_ts = functions_pp.import_ds_timemeanbins(var_filename,
                                                            **kwrgs_l)
            if type(q) is int:
                xarray = binary_occurences_quantile(xarray_ts, q=q)
            if type(q) is list:
                xarray = binary_occurences_quantile(xarray_ts, q=v2)
            xrclusteredij, result = skclustering(
                xarray, npmask,
                clustermethodkey=clustermethodkey,
                kwrgs=kwrgs)
            return {f'{v1}..{v2}': (xrclusteredij, result)}

        if n_cpu > 1:
            futures = []
            for kwrgs, kwrgs_l, v1, v2 in generator(var_filename, xarray,
                                                    comb, new_coords,
                                                    kwrgs_clust, kwrgs_load,
                                                    q):
                futures.append(delayed(execute_to_dict)(var_filename, npmask,
                                                        v1, v2, q,
                                                        clustermethodkey,
                                                        kwrgs, kwrgs_l))
            output = Parallel(n_jobs=n_cpu, backend='loky')(futures)
        else:
            output = []
            for kwrgs, kwrgs_l, v1, v2 in generator(var_filename, xarray,
                                                    comb, new_coords,
                                                    kwrgs_clust, kwrgs_load,
                                                    q):
                output.append(execute_to_dict(var_filename, npmask, v1, v2,
                                              q, clustermethodkey, kwrgs,
                                              kwrgs_l))
        # unpack output when done looping over parameters
        for run in output:
            for key, out in run.items():
                v1, v2 = float(key.split('..')[0]), float(key.split('..')[1])
                i, j = first_loop.index(v1), second_loop.index(v2)
                xrclustered[i, j] = xr.DataArray(
                    out[0],
                    dims=['latitude', 'longitude'],
                    coords=[xarray.latitude, xarray.longitude])
        if 'fake' in new_coords:
            xrclustered = xrclustered.squeeze().drop('fake').copy()
    else:
        del kwrgs_clust['q']
        npclustered, results = skclustering(xarray, npmask,
                                            clustermethodkey=clustermethodkey,
                                            kwrgs=kwrgs_clust)
        xrclustered = xr.DataArray(npclustered,
                                   dims=['latitude', 'longitude'],
                                   coords=[xarray.latitude, xarray.longitude])
    print('\n')
    xrclustered.attrs['method'] = clustermethodkey
    xrclustered.attrs['kwrgs'] = str(kwrgs_clust)
    xrclustered.attrs['target'] = f'{xarray.name}_exceedances_of_{q}th_percentile'
    if 'hash' not in xrclustered.attrs.keys():
        xrclustered.attrs['hash'] = uuid.uuid4().hex[:5]
    return xrclustered, results
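# Hedged usage sketch (added): grid over both the exceedance quantile q and
# n_clusters; each list becomes a dimension of xrclustered and the runs are
# distributed over n_cpu workers. The path is a hypothetical placeholder.
xrclustered, results = dendogram_clustering('/path/to/t2m_preprocessed.nc',
                                            mask=[230, 300, 25, 50],
                                            kwrgs_clust={'q': [80, 90, 95],
                                                         'n_clusters': [3, 5, 7]},
                                            n_cpu=3)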
# In[3]:

rg.pp_precursors()

# In[ ]:

rg.list_precur_pp
var_filename = rg.list_precur_pp[0][1]
region = 'USCAnew'
#%%
import pandas as pd

ds = core_pp.import_ds_lazy(var_filename)
ds.sel(time=core_pp.get_subdates(pd.to_datetime(ds.time.values),
                                 start_end_date=('06-01', '08-31'))).mean(
                                     dim='time').plot()
#%%
if region == 'USCAnew':
    selbox = (230, 300, 25, 70)
    TVpath = os.path.join(path_data, 'tfreq15_nc7_dendo_57db0USCA.nc')
    # np_array_xy = np.array([[-97, 39], [-89, 39], [-82, 40],
    #                         [-116, 36], [-122, 41], [-117, 46]])
    np_array_xy = np.array([[-96, 36], [-92, 41], [-84, 35], [-84, 41],
                            [-114, 36], [-120, 36], [-122, 44], [-118, 48]])
    t, c = 15, 7
# elif region == 'USCA':
def percentile_cluster(var_filename, xrclust, q=75, tailmean=True,
                       selbox=None):
    xarray = core_pp.import_ds_lazy(var_filename, selbox=selbox)
    labels = xrclust.values
    nparray = xarray.values
    n_t = xarray.time.size
    track_names = []
    area_grid = find_precursors.get_area(xarray)
    regions_for_ts = list(np.unique(labels[~np.isnan(labels)]))
    if tailmean:
        tmp_wgts = (area_grid / area_grid.mean())[:, :]
        a_wghts = np.tile(tmp_wgts[None, :], (n_t, 1, 1))
    else:
        a_wghts = area_grid / area_grid.mean()

    # this array will be the time series for each feature
    ts_clusters = np.zeros((xarray.shape[0], len(regions_for_ts)))

    # calculate area-weighted mean over labels
    for r in regions_for_ts:
        track_names.append(int(r))
        idx = regions_for_ts.index(r)
        # start with empty lonlat array
        B = np.zeros(xrclust.shape)
        # Mask everything except region of interest
        B[labels == r] = 1
        # Calculates how values inside region vary over time
        if tailmean == False:
            ts_clusters[:, idx] = np.nanpercentile(nparray[:, B == 1]
                                                   * a_wghts[B == 1],
                                                   q=q, axis=1)
        elif tailmean:
            # calc percentile of space for each timestep; note that we will
            # have a time-varying spatial mask.
            ts_clusters[:, idx] = np.nanpercentile(nparray[:, B == 1],
                                                   q=q, axis=1)
            # take a mean over all gridpoints that pass the percentile instead
            # of taking the single percentile value of a spatial region
            mask_B_perc = nparray[:, B == 1] > ts_clusters[:, idx, None]
            # if unlucky, the amount of gridcells that pass the percentile
            # value is not equal in each timestep. When this happens, we can
            # no longer reshape the array to (time, space) axes, and thus we
            # cannot take the mean over time.
            # check if masks have the same size over time
            cs_ = [int(mask_B_perc[t][mask_B_perc[t]].shape[0])
                   for t in range(n_t)]
            if np.unique(cs_).size != 1:
                # what is the most common size:
                common_shape = cs_[np.argmax([cs_.count(v)
                                              for v in np.unique(cs_)])]
                # convert all masks to most common size by randomly
                # adding/removing a True
                for t in range(n_t):
                    while mask_B_perc[t][mask_B_perc[t]].shape[0] < common_shape:
                        mask_B_perc[t][np.argwhere(
                            mask_B_perc[t] == False)[0][0]] = True
                    while mask_B_perc[t][mask_B_perc[t]].shape[0] > common_shape:
                        mask_B_perc[t][np.argwhere(
                            mask_B_perc[t] == True)[0][0]] = False
            nptimespacefull = nparray[:, B == 1].reshape(nparray.shape[0], -1)
            npuppertail = nptimespacefull[mask_B_perc]
            wghtsuppertail = a_wghts[:, B == 1][mask_B_perc]
            y = np.nanmean(npuppertail.reshape(n_t, -1) *
                           wghtsuppertail.reshape(n_t, -1), axis=1)
            ts_clusters[:, idx] = y
    xrts = xr.DataArray(ts_clusters.T,
                        coords={'cluster': track_names,
                                'time': xarray.time},
                        dims=['cluster', 'time'])
    return xrts
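# Hedged usage sketch (added): per-cluster upper-tail mean, i.e. the
# area-weighted mean over gridcells exceeding the 75th spatial percentile at
# each timestep. The path is a hypothetical placeholder.
xrts = percentile_cluster('/path/to/t2m_preprocessed.nc', xrclustered,
                          q=75, tailmean=True)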
def PDO_temp(filename, ex, df_splits=None):
    #%%
    '''
    PDO is calculated based upon all data points in the training years.
    Subsequently, the PDO pattern is projected on the sst.sel(time=dates_train)
    to enable retrieving the PDO timeseries on a subset of the year. It is
    similarly also projected on the dates_test.

    From https://climatedataguide.ucar.edu/climate-data/pacific-decadal-oscillation-pdo-definition-and-indices
    '''
    if df_splits is None:
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)
    kwrgs_pp = {'selbox': {'la_min': 20,  # select domain in degrees east
                           'la_max': 65,
                           'lo_min': 115,
                           'lo_max': 250},
                'format_lon': 'only_east'}
    ds = core_pp.import_ds_lazy(filename, **kwrgs_pp)
    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = functions_pp.time_mean_bins(ds, ex, to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates
    dates = pd.to_datetime(ds.time.values)
    splits = df_splits.index.levels[0]
    data = np.zeros((splits.size, ds.latitude.size, ds.longitude.size))
    PDO_patterns = xr.DataArray(data,
                                coords=[splits, ds.latitude.values,
                                        ds.longitude.values],
                                dims=['split', 'latitude', 'longitude'])
    list_splits = []
    for s in splits:
        progress = 100 * (s + 1) / splits.size
        dates_train = df_splits.loc[s]['TrainIsTrue'][
            df_splits.loc[s]['TrainIsTrue']].index
        train_yrs = np.unique(dates_train.year)
        dates_all_train = pd.to_datetime([d for d in dates
                                          if d.year in train_yrs])
        dates_test = df_splits.loc[s]['TrainIsTrue'][
            ~df_splits.loc[s]['TrainIsTrue']].index
        n = dates_train.size
        r = int(100 * n / df_splits.loc[s].index.size)
        print(f"\rProgress PDO traintest set {progress}%, "
              f"trainsize=({n}dp, {r}%)", end="")
        PDO_patterns[s], solver, adjust_sign = get_PDO(
            ds.sel(time=dates_all_train))
        PDO_patterns[s] = PDO_patterns[s].interpolate_na(dim='longitude')
        data_train = find_precursors.calc_spatcov(ds.sel(time=dates_train),
                                                  PDO_patterns[s])
        data_test = find_precursors.calc_spatcov(ds.sel(time=dates_test),
                                                 PDO_patterns[s])
        df_test = pd.DataFrame(data=data_test.values, index=dates_test,
                               columns=['0_901_PDO'])
        df_train = pd.DataFrame(data=data_train.values, index=dates_train,
                                columns=['0_901_PDO'])
        df = pd.concat([df_test, df_train]).sort_index()
        list_splits.append(df)
    df_PDO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_PDO
                              'y': 1.0,
                              'fontsize': 18}}
save = True
# rg.plot_maps_corr(var='z500', save=save,
#                   min_detect_gc=min_detect_gc,
#                   kwrgs_plot=kwrgs_plot,
#                   append_str=''.join(map(str, z500_green_bb))+TV+str(cluster_label))
z500 = rg.list_for_MI[0]
xrvals, xrmask = RGCPD._get_sign_splits_masked(z500.corr_xr, min_detect_gc,
                                               z500.corr_xr['mask'])
g = plot_maps.plot_corr_maps(xrvals, xrmask, **kwrgs_plot)
ds = core_pp.import_ds_lazy(TVpathtemp)
xrclustered = find_precursors.view_or_replace_labels(ds['xrclustered'],
                                                     cluster_label)
g.axes[0, 0].contour(xrclustered.longitude, xrclustered.latitude,
                     np.isnan(xrclustered),
                     transform=ccrs.PlateCarree(),
                     levels=[0, 2],
                     linewidths=2,
                     linestyles=['solid'],
                     colors=['white'])
filename = os.path.join(rg.path_outsub1,
                        'z500vsRW_' + ''.join(map(str, z500_green_bb)))
g.fig.savefig(filename + '.pdf', bbox_inches='tight')
g.fig.savefig(filename + '.jpg', dpi=300, bbox_inches='tight')
                                                kwrgs_load={'selbox': selbox},
                                                level='Countries')
if domain == 'USCA':
    mask_US_CA = np.logical_or(xarray.values == Country.US,
                               xarray.values == Country.CA)
elif domain == 'US':
    mask_US_CA = xarray.values == Country.US
# xr_mask = xarray.where(mask_US_CA)
xr_mask = xarray.where(make_country_mask.binary_erosion(mask_US_CA))
# xr_mask = xarray.where(make_country_mask.binary_erosion(np.nan_to_num(xr_mask)))
xr_mask.values[~np.isnan(xr_mask)] = 1
xr_mask = find_precursors.xrmask_by_latlon(xr_mask, upper_right=(270, 63))
# mask small Western US Island
xr_mask = find_precursors.xrmask_by_latlon(xr_mask, bottom_left=(228, 58))
# add Rocky mask
geo_surf_height = core_pp.import_ds_lazy(orography, var='z_NON_CDM',
                                         selbox=selbox) / 9.81
geo_surf_height = geo_surf_height.drop('time').drop('realization')
plot_maps.plot_corr_maps(geo_surf_height, cmap=plt.cm.Oranges,
                         clevels=np.arange(0, 2600, 500))
max_height = 1500
mask_Rockies = geo_surf_height < max_height
plot_maps.plot_labels(mask_Rockies)
xr_mask = xr_mask.where(mask_Rockies)
plot_maps.plot_labels(xr_mask)

# In[9]:
# =============================================================================
# Clustering co-occurence of anomalies different tfreqs
# =============================================================================
def PDO(filename, ex, df_splits=None):
    #%%
    '''
    PDO is calculated based upon all data points in the training years.
    Subsequently, the PDO pattern is projected on the sst.sel(time=dates_train)
    to enable retrieving the PDO timeseries on a subset of the year. It is
    similarly also projected on the dates_test.

    From https://climatedataguide.ucar.edu/climate-data/pacific-decadal-oscillation-pdo-definition-and-indices
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf
    '''
    t0 = time()
    if df_splits is None:
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)
    kwrgs_pp = {'selbox': {'la_min': 20,  # select domain in degrees east
                           'la_max': 65,
                           'lo_min': 115,
                           'lo_max': 250},
                'format_lon': 'only_east'}
    ds = core_pp.import_ds_lazy(filename, **kwrgs_pp)
    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = functions_pp.time_mean_bins(ds, ex, to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates
    dates = pd.to_datetime(ds.time.values)
    splits = df_splits.index.levels[0]
    data = np.zeros((splits.size, ds.latitude.size, ds.longitude.size))
    PDO_patterns = xr.DataArray(data,
                                coords=[splits, ds.latitude.values,
                                        ds.longitude.values],
                                dims=['split', 'latitude', 'longitude'])

    def PDO_single_split(s, ds, df_splits, PDO_patterns):
        progress = 100 * (s + 1) / splits.size
        dates_train = df_splits.loc[s]['TrainIsTrue'][
            df_splits.loc[s]['TrainIsTrue']].index
        train_yrs = np.unique(dates_train.year)
        dates_all_train = pd.to_datetime([d for d in dates
                                          if d.year in train_yrs])
        dates_test = df_splits.loc[s]['TrainIsTrue'][
            ~df_splits.loc[s]['TrainIsTrue']].index
        n = dates_train.size
        r = int(100 * n / df_splits.loc[s].index.size)
        print(f"\rProgress PDO traintest set {progress}%, "
              f"trainsize=({n}dp, {r}%)", end="")
        PDO_pattern, solver, adjust_sign = get_PDO(
            ds.sel(time=dates_all_train))
        # project the freshly fitted pattern; PDO_patterns[s] is still all
        # zeros at this point and must not be used here
        data_train = find_precursors.calc_spatcov(ds.sel(time=dates_train),
                                                  PDO_pattern)
        data_test = find_precursors.calc_spatcov(ds.sel(time=dates_test),
                                                 PDO_pattern)
        df_test = pd.DataFrame(data=data_test.values, index=dates_test,
                               columns=['0_901_PDO'])
        df_train = pd.DataFrame(data=data_train.values, index=dates_train,
                                columns=['0_901_PDO'])
        df = pd.concat([df_test, df_train]).sort_index()
        return (df, PDO_pattern)

    pool = ProcessPoolExecutor(os.cpu_count() - 1)  # amount of cores - 1
    futures = [pool.submit(PDO_single_split, s, ds, df_splits, PDO_patterns)
               for s in splits]
    results = [future.result() for future in futures]
    list_splits = [r[0] for r in results]
    time_ = time() - t0
    print(time_ / 60)
    for s in splits:
        PDO_patterns[s] = results[s][1]
    df_PDO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_PDO, PDO_patterns
def create_mask(path, kwrgs_load={}, level='Continents'):
    '''
    Parameters
    ----------
    path : str
        full path to netcdf file for which you want to create a mask.
    kwrgs_load : dict, optional
        See kwargs core_pp.import_ds_lazy?
    level : str, optional
        'Countries' or 'Continents'. The default is 'Continents'.

    Returns
    -------
    xr.DataArray
        mask with labels for each country.
    '''
    f_name = os.path.splitext(path)[0].split('/')[-1]
    folder_file = '/'.join(os.path.splitext(path)[0].split('/')[:-1])
    mask_dir = os.path.join(folder_file, 'masks')
    if os.path.isdir(mask_dir) != True:
        os.makedirs(mask_dir)
    mask_file = os.path.join(mask_dir, f_name + '_' + level)
    if 'selbox' in kwrgs_load.keys():
        lo_min, lo_max, la_min, la_max = kwrgs_load['selbox']
        domainstr = '_lats[{}_{}]_lons[{}_{}]'.format(int(la_min),
                                                      int(la_max),
                                                      int(lo_min),
                                                      int(lo_max))
        mask_file = mask_file + domainstr
    if os.path.exists(mask_file + '.nc'):
        return core_pp.import_ds_lazy(mask_file + '.nc'), Country

    ds = core_pp.import_ds_lazy(path, **kwrgs_load)
    # Load Coordinates and Normalize to ShapeFile Coordinates
    coordinates = era_coordinate_grid(ds)
    coordinates[..., 0][coordinates[..., 0] > 180] -= 360
    # Take Center of Grid Cell as Coordinate
    coordinates[..., 0] += (coordinates[0, 1, 0] - coordinates[0, 0, 0]) / 2
    coordinates[..., 1] += (coordinates[1, 0, 1] - coordinates[0, 0, 1]) / 2
    # Create Mask
    if level == 'Continents':
        mask = Continent_mask(coordinates.reshape(-1, 2)).reshape(
            coordinates.shape[:2])
    elif level == 'Countries':
        mask = country_mask(coordinates.reshape(-1, 2)).reshape(
            coordinates.shape[:2])
    country_code = [{k: Country.__getitem__(k).value}
                    for k in Country._member_names_]
    # np.save(mask_file+'.npy', mask)
    if 'time' in ds.dims:
        mask_xr = ds.isel(time=0).copy().drop('time')
    else:
        mask_xr = ds.copy()
    mask_xr.name = 'country_mask'
    for dic in country_code:
        key, value = list(dic.items())[0]
        mask_xr.attrs[key] = value
    # mask_xr.attrs = {'country_code': country_code}
    mask_xr.values = mask
    mask_xr = mask_xr.astype(int)
    mask_xr.to_netcdf(mask_file + '.nc', mode='w')
    return mask_xr, Country
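# Hedged usage sketch (added): build (or reload from the cached .nc file) a
# country-label mask on the grid of a hypothetical input file, then select
# the US, mirroring the script snippets above.
xr_mask, Country = create_mask('/path/to/field.nc',
                               kwrgs_load={'selbox': (225, 300, 20, 70)},
                               level='Countries')
mask_US = xr_mask.values == Country.US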
                         **kwrgs_plot)
plt.savefig(os.path.join(rg.path_outsub1, f'snapshots_{var}_rm{rm}.pdf'))
#%% Correlation PNA-like RW with Wavenumber 6 phase 2
# only for eastern
import core_pp, find_precursors

values = []
if west_or_east == 'eastern':
    lags_list = range(-10, 10)
    for lag in lags_list:
        selbox = (0, 360, 25, 60)
        # selbox = (140, 300, 20, 73)
        tfreq = 1
        # lag = 0
        dates_RV = core_pp.get_subdates(pd.to_datetime(rg.fulltso.time.values),
                                        start_end_date=rg.start_end_TVdate)
        RV_ts = rg.fulltso.sel(time=dates_RV)
        ds_v300 = core_pp.import_ds_lazy(rg.list_precur_pp[1][1])
        dslocal = core_pp.get_selbox(ds_v300, selbox=selbox)
        datesRW = core_pp.get_subdates(pd.to_datetime(dslocal.time.values),
                                       start_end_date=rg.start_end_TVdate)
        datesRW = datesRW + pd.Timedelta(f'{lag}d')
        dslocal = dslocal.sel(time=datesRW)
        wv6local = core_pp.get_selbox(xarray.sel(lag=5), selbox=selbox)
        patternlocal = wv6local.mean(dim='lag')
        ts = find_precursors.calc_spatcov(dslocal, patternlocal)
        ts_15, d = functions_pp.time_mean_bins(ts, tfreq,
                                               start_end_date=start_end_TVdate,
                                               closed_on_date=start_end_TVdate[-1])
        RV_15, d = functions_pp.time_mean_bins(RV_ts, tfreq,
                                               start_end_date=start_end_TVdate,
def PDO(filepath, df_splits=None, n_jobs=1):
    #%%
    '''
    PDO is calculated based upon all data points in the training years.
    Subsequently, the PDO pattern is projected on the sst.sel(time=dates_train)
    to enable retrieving the PDO timeseries on a subset of the year. It is
    similarly also projected on the dates_test.

    From https://climatedataguide.ucar.edu/climate-data/pacific-decadal-oscillation-pdo-definition-and-indices
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf

    selbox has format of (lon_min, lon_max, lat_min, lat_max)
    '''
    t0 = time()
    # old format selbox
    # {'la_min': 20,  # select domain in degrees east
    #  'la_max': 70,
    #  'lo_min': 115,
    #  'lo_max': 250},
    kwrgs_pp = {'selbox': (115, 250, 20, 70), 'format_lon': 'only_east'}
    ds = core_pp.import_ds_lazy(filepath, **kwrgs_pp)
    ds_monthly = ds.resample(time='M',
                             restore_coord_dims=False).mean(dim='time',
                                                            skipna=True)
    # ds_global = core_pp.import_ds_lazy(filepath)
    # ds.mean(dim=('latitude','longitude'))  # global mean SST anomaly each timestep
    if df_splits is None:
        print('No train-test split')
        iterables = [np.array([0]), pd.to_datetime(ds.time.values)]
        df_splits = pd.DataFrame(data=np.ones(ds.time.size),
                                 index=pd.MultiIndex.from_product(iterables),
                                 columns=['TrainIsTrue'],
                                 dtype=bool)
    splits = df_splits.index.levels[0]
    data = np.zeros((splits.size, ds.latitude.size, ds.longitude.size))
    PDO_patterns = xr.DataArray(data,
                                coords=[splits, ds.latitude.values,
                                        ds.longitude.values],
                                dims=['split', 'latitude', 'longitude'])
    if n_jobs > 1:
        with ProcessPoolExecutor(max_workers=os.cpu_count()) as pool:
            futures = [pool.submit(PDO_single_split, s, ds_monthly, ds,
                                   df_splits) for s in range(splits.size)]
            results = [future.result() for future in futures]
    else:
        results = [PDO_single_split(s, ds_monthly, ds, df_splits)
                   for s in range(splits.size)]

    list_PDO_ts = [r[0] for r in results]

    time_ = time() - t0
    print(time_ / 60)

    for s in splits:
        PDO_patterns[s] = results[s][1]

    df_PDO = pd.concat(list_PDO_ts, axis=0, keys=splits)
    # merge df_splits
    df_PDO = df_PDO.merge(df_splits, left_index=True, right_index=True)
    if splits.size > 1:
        # train-test splits should not be equal
        assert float((df_PDO.loc[1] - df_PDO.loc[0]).mean()) != 0, (
            'something went wrong with train test splits')
    #%%
    return df_PDO, PDO_patterns
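# Hedged usage sketch (added): compute the PDO pattern and timeseries; with
# df_splits=None a single dummy split is constructed internally. The SST path
# is a hypothetical placeholder.
df_PDO, PDO_patterns = PDO('/path/to/sst_preprocessed.nc', df_splits=None,
                           n_jobs=1)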
def spatial_valid(var_filename, mask, y_pred_all, y_pred_c, lags_i=None,
                  seldates=None, clusters=None, kwrgs_events=None,
                  alpha=0.05, n_boot=0, blocksize=10,
                  threshold_pred='upper_clim'):
    '''
    var_filename must be a 3d netcdf file with only one variable.
    mask can be a nc file containing only a mask, or a latlon box in the
    format [west_lon, east_lon, south_lat, north_lat] in common west-east
    degrees.
    '''
    # debug leftovers: these hard-coded paths overrode the arguments above;
    # kept for reference only
    # var_filename = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/preprocessed/t2mmax_US_1979-2018_1jan_31dec_daily_0.25deg.nc'
    # mask = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/preprocessed/cluster_output.nc'
    if lags_i is None:
        lags_i = list(y_pred_all.columns)
    # load in daily xarray and mask
    xarray = core_pp.import_ds_lazy(var_filename)
    npmask = cl.get_spatial_ma(var_filename, mask)
    # process temporal info
    freq = (y_pred_c.index[1] - y_pred_c.index[0]).days
    if seldates is None:
        seldates = aggr_to_daily_dates(y_pred_c.index)
    start = f'{seldates[0].month}-{seldates[0].day}'
    end = f'{seldates[-1].month}-{seldates[-1].day}'
    start_end_date = (start, end)
    xarray, dates = functions_pp.time_mean_bins(xarray, to_freq=freq,
                                                start_end_date=start_end_date)
    # if switching to event timeseries:
    if kwrgs_events is None:
        kwrgs_events = {'event_percentile': 66}
    # unpack other optional arguments for defining event timeseries
    kwrgs = {key: item for key, item in kwrgs_events.items()
             if key != 'event_percentile'}
    if clusters is None:
        clusters = list(np.unique(npmask[~np.isnan(npmask)]))
    elif type(clusters) is int:
        clusters = [clusters]
    dict_allclus = {}
    for clus in clusters:
        latloni = np.where(npmask == clus)
        latloni = [(latloni[0][i], latloni[1][i])
                   for i in range(latloni[0].size)]
        futures = {}
        # max_cpu is assumed to be defined at module level
        with ProcessPoolExecutor(max_workers=max_cpu) as pool:
            for ll in latloni:
                xr_gridcell = xarray.isel(latitude=ll[0]).isel(longitude=ll[1])
                threshold = func_fc.Ev_threshold(
                    xr_gridcell, kwrgs_events['event_percentile'])
                y_i = func_fc.Ev_timeseries(xr_gridcell, threshold, **kwrgs)[0]
                futures[ll] = pool.submit(valid.get_metrics_sklearn,
                                          y_i.values,
                                          y_pred_all[lags_i],
                                          y_pred_c,
                                          alpha=alpha,
                                          n_boot=n_boot,
                                          blocksize=blocksize,
                                          threshold_pred=threshold_pred)
            results = {key: future.result()
                       for key, future in futures.items()}
        dict_allclus[clus] = results

    df_valid = dict_allclus[clus][ll][0]
    metrics = np.unique(df_valid.index.get_level_values(0))
    lags_tf = [l * freq for l in lags_i]
    if freq != 1:
        # the last day of the time mean bin is tfreq/2 later than the
        # centered day
        lags_tf = [l_tf - int(freq / 2) if l_tf != 0 else 0
                   for l_tf in lags_tf]
    for clus in clusters:
        results = dict_allclus[clus]
        xroutput = xarray.isel(time=lags_i).rename({'time': 'lag'})
        xroutput['lag'] = lags_tf
        xroutput = xroutput.expand_dims({'metric': metrics}, 0)
        npdata = np.array(np.zeros_like(xroutput), dtype='float32')
        for ll in latloni:
            df_valid = dict_allclus[clus][ll][0]
            for i, met in enumerate(metrics):
                lat_i = ll[0]
                lon_i = ll[1]
                npdata[i, :, lat_i, lon_i] = df_valid.loc[met].loc[met]
        xroutput.values = npdata
        plot_maps.plot_corr_maps(xroutput.where(npmask == clus),
                                 row_dim='metric', size=4,
                                 clevels=np.arange(-1, 1.1, 0.2))
        BSS = xroutput.where(npmask == clus).sel(metric='BSS')
        plot_maps.plot_corr_maps(BSS, row_dim='metric', size=4,
                                 clevels=np.arange(-0.25, 0.251, 0.05),
                                 cbar_vert=-0.1)
def ENSO_34(filepath, df_splits=None, get_ENSO_states: bool = True):
    #%%
    # file_path = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/sst_1979-2018_1_12_daily_2.5deg.nc'
    '''
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf

    selbox has format of (lon_min, lon_max, lat_min, lat_max)
    '''
    # if df_splits is None:
    #     seldates = None
    # else:
    #     seldates = df_splits.loc[0].index
    # old format selbox:
    # {'la_min': -5,  # select domain in degrees east
    #  'la_max': 5,
    #  'lo_min': -170,
    #  'lo_max': -120},
    kwrgs_pp = {'selbox': (190, 240, -5, 5),
                'format_lon': 'only_east',
                'seldates': None}
    ds = core_pp.import_ds_lazy(filepath, **kwrgs_pp)
    dates = pd.to_datetime(ds.time.values)
    data = functions_pp.area_weighted(ds).mean(dim=('latitude', 'longitude'))
    df_ENSO = pd.DataFrame(data=data.values, index=dates, columns=['ENSO34'])
    if df_splits is not None:
        splits = df_splits.index.levels[0]
        df_ENSO = pd.concat([df_ENSO] * splits.size, axis=0, keys=splits)
    if get_ENSO_states:
        '''
        From Anderson 2017 - Life cycles of agriculturally relevant ENSO
        teleconnections in North and South America.
        http://doi.wiley.com/10.1002/joc.4916
        A year is labelled as an ENSO state when the mean boreal wintertime
        (October, November, December) SST anomaly amplitude in the Nino 3.4
        region exceeded 1 (out of the 2 possible) standard deviation
        threshold(s); the code below uses the 1 std threshold.
        '''
        if hasattr(df_ENSO.index, 'levels'):
            df_ENSO_s = df_ENSO.loc[0]
        else:
            df_ENSO_s = df_ENSO
        dates = df_ENSO_s.index
        df_3monthmean = df_ENSO_s.rolling(3, center=True,
                                          min_periods=1).mean()
        std_ENSO = df_3monthmean.std()
        OND, groups = core_pp.get_subdates(dates,
                                           start_end_date=('10-01', '12-31'),
                                           returngroups=True)
        OND_ENSO = df_3monthmean.loc[OND].groupby(groups).mean()
        nino_yrs = OND_ENSO[OND_ENSO > df_3monthmean.mean()
                            + std_ENSO][:].dropna().index  # + 1
        nina_yrs = OND_ENSO[OND_ENSO < df_3monthmean.mean()
                            - std_ENSO][:].dropna().index  # + 1
        neutral = [y for y in OND_ENSO.index
                   if y not in core_pp.flatten([nina_yrs, nino_yrs])]
        states = {}
        for i, d in enumerate(dates):
            if d.year in nina_yrs:
                states[d.year] = -1
            if d.year in neutral:
                states[d.year] = 0
            if d.year in nino_yrs:
                states[d.year] = 1

        cycle_list = []
        for s, v in [('EN', 1), ('LN', -1)]:
            ENSO_cycle = {d.year: 0 for d in dates}
            for i, year in enumerate(np.unique(dates.year)):
                # d = dates[1]
                # if states[year] == v:
                #     s = 'EN'
                # elif states[year] == -1:
                #     s = 'LN'
                if states[year] == v:
                    ENSO_cycle[year] = f'{s}0'
                    if year - 1 in dates.year and states[year - 1] != v:
                        ENSO_cycle[year - 1] = f'{s}-1'
                    if year + 1 in dates.year and states[year + 1] != v:
                        ENSO_cycle[year + 1] = f'{s}+1'
            cycle_list.append(ENSO_cycle)
        time_index = pd.to_datetime([f'{y}-01-01' for y in states.keys()])
        df_state = pd.concat([pd.Series(states),
                              pd.Series(cycle_list[0]),
                              pd.Series(cycle_list[1])],
                             axis=1, keys=['state', 'EN_cycle', 'LN_cycle'])
        df_state.index = time_index
        if hasattr(df_ENSO.index, 'levels'):
            # copy to other traintest splits
            df_state = pd.concat([df_state] * splits.size, keys=splits)
        composites = np.zeros(3, dtype=object)
        for i, yrs in enumerate([nina_yrs, neutral, nino_yrs]):
            composite = [d for d in dates if d.year in yrs]
            composites[i] = ds.sel(time=composite).mean(dim='time')
        composites = xr.concat(composites, dim='state')
        composites['state'] = ['Nina', 'Neutral', 'Nino']
        plot_maps.plot_corr_maps(composites, row_dim='state', hspace=0.5)
        out = df_ENSO, [np.array(nina_yrs), np.array(neutral),
                        np.array(nino_yrs)], df_state
    else:
        out = df_ENSO
    #%%
    return out
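# Hedged usage sketch (added): ENSO 3.4 index plus the Anderson (2017)
# ENSO-state labels; the SST path is a hypothetical placeholder.
df_ENSO, (nina, neutral, nino), df_state = ENSO_34(
    '/path/to/sst_preprocessed.nc', df_splits=None, get_ENSO_states=True)
print('El Nino years:', nino)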