def spatial_mean_clusters(var_filename, xrclust, kwrgs_load: dict = None):
    #%%
    if kwrgs_load is None:
        kwrgs_load = {}
    if isinstance(var_filename, str):
        xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_load)
    elif isinstance(var_filename, xr.DataArray):
        xarray = var_filename
    else:
        raise TypeError('Give var_filename as str or xr.DataArray')
    labels = xrclust.values
    nparray = xarray.values
    track_names = []
    area_grid = find_precursors.get_area(xarray)
    regions_for_ts = list(np.unique(labels[~np.isnan(labels)]))
    a_wghts = area_grid / area_grid.mean()

    # this array will hold the time series of each cluster
    ts_clusters = np.zeros((xarray.shape[0], len(regions_for_ts)))

    # calculate area-weighted mean per cluster label
    for r in regions_for_ts:
        track_names.append(int(r))
        idx = regions_for_ts.index(r)
        # start with empty lonlat array
        B = np.zeros(xrclust.shape)
        # mask everything except the region of interest
        B[labels == r] = 1
        # area-weighted mean of the gridcells inside the region, per timestep
        ts_clusters[:, idx] = np.nanmean(nparray[:, B == 1] * a_wghts[B == 1],
                                         axis=1)
    xrts = xr.DataArray(ts_clusters.T,
                        coords={'cluster': track_names,
                                'time': xarray.time},
                        dims=['cluster', 'time'])

    # extract the non-standard (clustering-setting) coordinates and store
    # their values as attributes on both the label map and the timeseries
    dims = list(xrclust.coords.keys())
    standard_dim = ['latitude', 'longitude', 'time', 'mask', 'cluster']
    dims = [d for d in dims if d not in standard_dim]
    if 'n_clusters' in dims:
        idx = dims.index('n_clusters')
        dims[idx] = 'ncl'
        xrclust = xrclust.rename({'n_clusters': dims[idx]}).copy()
    if len(dims) >= 1:
        dim1 = dims[0]
        var1 = str(xrclust[dim1].values)
        xrts.attrs[dim1] = var1
        xrclust.attrs[dim1] = var1
        xrclust = xrclust.drop(dim1)
    if len(dims) == 2:
        dim2 = dims[1]
        var2 = int(xrclust[dim2])
        xrts.attrs[dim2] = var2
        xrclust.attrs[dim2] = var2
        xrclust = xrclust.drop(dim2)
    ds = xr.Dataset({'xrclustered': xrclust, 'ts': xrts})
    #%%
    return ds
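# Minimal usage sketch for spatial_mean_clusters (assumption: 'sst_daily.nc' and
# the label map 'xrclustered' are placeholders for a netcdf path and a clustered
# xr.DataArray produced elsewhere; the selbox value is illustrative):
# ds = spatial_mean_clusters('sst_daily.nc', xrclustered,
#                            kwrgs_load={'selbox': (150, 260, 10, 60)})
# ds['ts']           # (cluster, time) area-weighted mean per cluster label
# ds['xrclustered']  # label map with the clustering settings stored as attrs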
def loop_get_spatcov(precur, precur_aggr=None, kwrgs_load: dict = None,
                     force_reload: bool = False, lags: list = None):

    name = precur.name
    use_sign_pattern = precur.use_sign_pattern
    corr_xr = precur.corr_xr
    prec_labels = precur.prec_labels
    splits = corr_xr.split
    if lags is not None:
        lags = np.array(lags)  # ensure lags is an np.ndarray
        corr_xr = corr_xr.sel(lag=lags).copy()
        prec_labels = prec_labels.sel(lag=lags).copy()
    else:
        lags = prec_labels.lag.values

    dates = pd.to_datetime(precur.precur_arr.time.values)
    oneyr = functions_pp.get_oneyr(dates)
    if oneyr.size == 1:  # single value per year precursor
        tfreq = 365
    else:
        tfreq = (oneyr[1] - oneyr[0]).days

    if precur_aggr is None and not force_reload:
        precur_arr = precur.precur_arr
        if tfreq == 365:
            precur_arr = precur.precur_arr
        # Use the precursor array with the temporal aggregation that was used
        # to create the correlation map. When tfreq == 365, the aggregation
        # (one value per year) is already done; the period used to aggregate
        # was defined by the lag.
    else:
        if precur_aggr is not None:
            precur.tfreq = precur_aggr
        precur.load_and_aggregate_precur(kwrgs_load.copy())
        precur_arr = precur.precur_arr

    precur.area_grid = find_precursors.get_area(precur_arr)

    if precur_arr.shape[-2:] != corr_xr.shape[-2:]:
        print('shape loaded precur_arr != corr map, matching coords')
        corr_xr, prec_labels = functions_pp.match_coords_xarrays(
            precur_arr, *[corr_xr, prec_labels])

    ts_sp = np.zeros((splits.size), dtype=object)
    for s in splits:
        ts_list = np.zeros((lags.size), dtype=list)
        track_names = []
        for il, lag in enumerate(lags):
            # if the lag represents an aggregation period:
            if type(precur.lags[il]) is np.ndarray and precur_aggr is None:
                precur_arr = precur.precur_arr.sel(lag=il)

            corr_vals = corr_xr.sel(split=s).isel(lag=il)
            mask = prec_labels.sel(split=s).isel(lag=il)
            pattern = corr_vals.where(~np.isnan(mask))
            if use_sign_pattern:
                pattern = np.sign(pattern)
            if np.isnan(pattern.values).all():
                # no regions for this variable and split
                nants = np.zeros((precur_arr.time.size, 1))
                nants[:] = np.nan
                ts_list[il] = nants
            else:
                # if normalize == True:
                #     spatcov_full = calc_spatcov(full_timeserie, pattern)
                #     mean = spatcov_full.sel(time=dates_train).mean(dim='time')
                #     std = spatcov_full.sel(time=dates_train).std(dim='time')
                #     spatcov_test = ((spatcov_full - mean) / std)
                # elif normalize == False:
                xrts = find_precursors.calc_spatcov(precur_arr, pattern)
                ts_list[il] = xrts.values[:, None]
            track_names.append(f'{lag}..0..{precur.name}' + '_sp')

        # concatenate the timeseries of all lags
        tsCorr = np.concatenate(tuple(ts_list), axis=1)
        dates = pd.to_datetime(precur_arr.time.values)
        ts_sp[s] = pd.DataFrame(tsCorr, index=dates, columns=track_names)
    # df_sp = pd.concat(list(ts_sp), keys=range(splits.size))
    return ts_sp
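# Minimal usage sketch for loop_get_spatcov (assumption: 'precur' is a precursor
# object as used elsewhere in this package, carrying .corr_xr, .prec_labels,
# .precur_arr, .lags and .use_sign_pattern; the values shown are placeholders):
# ts_sp = loop_get_spatcov(precur, precur_aggr=15,
#                          kwrgs_load=kwrgs_load, lags=[0, 1])
# ts_sp[0]  # pd.DataFrame of split 0, one spatial-covariance column per lag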
def percentile_cluster(var_filename, xrclust, q=75, tailmean=True, selbox=None):
    xarray = core_pp.import_ds_lazy(var_filename, selbox=selbox)
    labels = xrclust.values
    nparray = xarray.values
    n_t = xarray.time.size
    track_names = []
    area_grid = find_precursors.get_area(xarray)
    regions_for_ts = list(np.unique(labels[~np.isnan(labels)]))
    if tailmean:
        tmp_wgts = area_grid / area_grid.mean()
        a_wghts = np.tile(tmp_wgts[None, :], (n_t, 1, 1))
    else:
        a_wghts = area_grid / area_grid.mean()

    # this array will hold the time series of each cluster
    ts_clusters = np.zeros((xarray.shape[0], len(regions_for_ts)))

    # calculate area-weighted percentile (or upper-tail mean) per cluster label
    for r in regions_for_ts:
        track_names.append(int(r))
        idx = regions_for_ts.index(r)
        # start with empty lonlat array
        B = np.zeros(xrclust.shape)
        # mask everything except the region of interest
        B[labels == r] = 1
        # time series of the spatial q-th percentile inside the region
        if not tailmean:
            ts_clusters[:, idx] = np.nanpercentile(
                nparray[:, B == 1] * a_wghts[B == 1], q=q, axis=1)
        else:
            # calc the spatial percentile for each timestep; note that this
            # gives a time-varying spatial mask.
            ts_clusters[:, idx] = np.nanpercentile(nparray[:, B == 1], q=q,
                                                   axis=1)
            # take a mean over all gridpoints that exceed the percentile,
            # instead of taking the single percentile value of the region
            mask_B_perc = nparray[:, B == 1] > ts_clusters[:, idx, None]
            # The number of gridcells exceeding the percentile value is not
            # necessarily the same at every timestep. When it differs, the
            # array can no longer be reshaped to (time, space), and thus the
            # mean over space per timestep cannot be taken.
            # check whether the masks have the same size at every timestep
            cs_ = [int(mask_B_perc[t][mask_B_perc[t]].shape[0])
                   for t in range(n_t)]
            if np.unique(cs_).size != 1:
                # find the most common mask size across timesteps
                uniq_sizes, size_counts = np.unique(cs_, return_counts=True)
                common_shape = int(uniq_sizes[np.argmax(size_counts)])
                # convert all masks to the most common size by arbitrarily
                # flipping the first False/True gridcell found
                for t in range(n_t):
                    while mask_B_perc[t][mask_B_perc[t]].shape[0] < common_shape:
                        mask_B_perc[t][np.argwhere(
                            mask_B_perc[t] == False)[0][0]] = True
                    while mask_B_perc[t][mask_B_perc[t]].shape[0] > common_shape:
                        mask_B_perc[t][np.argwhere(
                            mask_B_perc[t] == True)[0][0]] = False
            nptimespacefull = nparray[:, B == 1].reshape(nparray.shape[0], -1)
            npuppertail = nptimespacefull[mask_B_perc]
            wghtsuppertail = a_wghts[:, B == 1][mask_B_perc]
            y = np.nanmean(npuppertail.reshape(n_t, -1) *
                           wghtsuppertail.reshape(n_t, -1), axis=1)
            ts_clusters[:, idx] = y
    xrts = xr.DataArray(ts_clusters.T,
                        coords={'cluster': track_names,
                                'time': xarray.time},
                        dims=['cluster', 'time'])
    return xrts
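# Minimal usage sketch for percentile_cluster (assumption: 't2m_daily.nc' and
# 'xrclustered' are placeholders; with q=90 and tailmean=True each cluster's
# timeseries is the area-weighted mean over the gridcells above the 90th
# spatial percentile at each timestep):
# xrts = percentile_cluster('t2m_daily.nc', xrclustered, q=90,
#                           tailmean=True, selbox=(230, 300, 25, 50))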