Example #1
# Module-level imports needed by the examples below; core_pp, find_precursors
# and functions_pp are helper modules from the surrounding project (providing
# import_ds_lazy, get_area, calc_spatcov, get_oneyr, match_coords_xarrays) and
# are assumed to be importable alongside this code.
import numpy as np
import pandas as pd
import xarray as xr


def spatial_mean_clusters(var_filename, xrclust, kwrgs_load: dict = {}):
    #%%
    if type(var_filename) is str:
        xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_load)
    elif type(var_filename) is xr.DataArray:
        xarray = var_filename
    else:
        raise TypeError('Give var_filename as str or xr.DataArray')

    labels = xrclust.values
    nparray = xarray.values
    track_names = []
    area_grid = find_precursors.get_area(xarray)
    regions_for_ts = list(np.unique(labels[~np.isnan(labels)]))
    a_wghts = area_grid / area_grid.mean()

    # this array will be the time series for each feature
    ts_clusters = np.zeros((xarray.shape[0], len(regions_for_ts)))

    # calculate area-weighted mean over labels
    for r in regions_for_ts:
        track_names.append(int(r))
        idx = regions_for_ts.index(r)
        # start with empty lonlat array
        B = np.zeros(xrclust.shape)
        # Mask everything except region of interest
        B[labels == r] = 1
        # area-weighted mean over the region's gridcells for each timestep
        ts_clusters[:, idx] = np.nanmean(nparray[:, B == 1] * a_wghts[B == 1],
                                         axis=1)
    xrts = xr.DataArray(ts_clusters.T,
                        coords={
                            'cluster': track_names,
                            'time': xarray.time
                        },
                        dims=['cluster', 'time'])

    # extract the clustering settings (non-standard coords) and store them as attrs
    dims = list(xrclust.coords.keys())
    standard_dim = ['latitude', 'longitude', 'time', 'mask', 'cluster']
    dims = [d for d in dims if d not in standard_dim]
    if 'n_clusters' in dims:
        idx = dims.index('n_clusters')
        dims[idx] = 'ncl'
        xrclust = xrclust.rename({'n_clusters': dims[idx]}).copy()
    # assumes at least one non-standard coordinate (e.g. 'ncl') is present
    var1 = str(xrclust[dims[0]])
    dim1 = dims[0]
    xrts.attrs[dim1] = var1
    xrclust.attrs[dim1] = var1
    xrclust = xrclust.drop(dim1)
    if len(dims) == 2:
        var2 = int(xrclust[dims[1]])
        dim2 = dims[1]
        xrts.attrs[dim2] = var2
        xrclust.attrs[dim2] = var2
        xrclust = xrclust.drop(dim2)
    ds = xr.Dataset({'xrclustered': xrclust, 'ts': xrts})
    #%%
    return ds
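For context, a minimal self-contained usage sketch (not part of the original code): it builds a tiny synthetic field and a matching cluster-label map, then extracts the per-cluster mean timeseries. It assumes find_precursors.get_area is available and returns a (latitude, longitude) array of gridcell areas; all data and coordinate values below are purely illustrative.

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', periods=10, freq='D')
lat = np.arange(30., 60., 5.)
lon = np.arange(0., 40., 5.)
field = xr.DataArray(np.random.rand(time.size, lat.size, lon.size),
                     coords={'time': time, 'latitude': lat, 'longitude': lon},
                     dims=['time', 'latitude', 'longitude'])
# two rectangular "clusters" covering the western and eastern half of the grid
labels = np.ones((lat.size, lon.size))
labels[:, lon.size // 2:] = 2.
xrclust = xr.DataArray(labels,
                       coords={'latitude': lat, 'longitude': lon,
                               'n_clusters': 2},  # extra setting, stored as attr
                       dims=['latitude', 'longitude'])
ds = spatial_mean_clusters(field, xrclust)  # var_filename may also be a filepath
print(ds['ts'].sel(cluster=1))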
Example #2
def loop_get_spatcov(precur,
                     precur_aggr=None,
                     kwrgs_load: dict = None,
                     force_reload: bool = False,
                     lags: list = None):

    name = precur.name
    use_sign_pattern = precur.use_sign_pattern
    corr_xr = precur.corr_xr
    prec_labels = precur.prec_labels
    splits = corr_xr.split
    if lags is not None:
        lags = np.array(lags)  # ensure lag is np.ndarray
        corr_xr = corr_xr.sel(lag=lags).copy()
        prec_labels = prec_labels.sel(lag=lags).copy()
    else:
        lags = prec_labels.lag.values
    dates = pd.to_datetime(precur.precur_arr.time.values)
    oneyr = functions_pp.get_oneyr(dates)
    if oneyr.size == 1:  # single val per year precursor
        tfreq = 365
    else:
        tfreq = (oneyr[1] - oneyr[0]).days

    if precur_aggr is None and not force_reload:
        # Use the precursor array with the temporal aggregation that was used
        # to create the correlation map. When tfreq == 365 (one value per
        # year), the aggregation is already done; the period used to aggregate
        # was defined by the lag.
        precur_arr = precur.precur_arr
    else:
        if precur_aggr is not None:
            precur.tfreq = precur_aggr
        # kwrgs_load must be provided when (re)loading the precursor data
        precur.load_and_aggregate_precur(kwrgs_load.copy())
        precur_arr = precur.precur_arr

    precur.area_grid = find_precursors.get_area(precur_arr)
    if precur_arr.shape[-2:] != corr_xr.shape[-2:]:
        print('shape loaded precur_arr != corr map, matching coords')
        corr_xr, prec_labels = functions_pp.match_coords_xarrays(
            precur_arr, *[corr_xr, prec_labels])

    ts_sp = np.zeros((splits.size), dtype=object)
    for s in splits:
        ts_list = np.zeros((lags.size), dtype=list)
        track_names = []
        for il, lag in enumerate(lags):

            # if lag represents aggregation period:
            if type(precur.lags[il]) is np.ndarray and precur_aggr is None:
                precur_arr = precur.precur_arr.sel(lag=il)

            corr_vals = corr_xr.sel(split=s).isel(lag=il)
            mask = prec_labels.sel(split=s).isel(lag=il)
            pattern = corr_vals.where(~np.isnan(mask))
            if use_sign_pattern:
                pattern = np.sign(pattern)
            if np.isnan(pattern.values).all():
                # no precursor regions for this variable and split: fill with NaNs
                nans = np.full((precur_arr.time.size, 1), np.nan)
                ts_list[il] = nans
            else:
                # if normalize == True:
                #     spatcov_full = calc_spatcov(full_timeserie, pattern)
                #     mean = spatcov_full.sel(time=dates_train).mean(dim='time')
                #     std = spatcov_full.sel(time=dates_train).std(dim='time')
                #     spatcov_test = ((spatcov_full - mean) / std)
                # elif normalize == False:
                xrts = find_precursors.calc_spatcov(precur_arr, pattern)
                ts_list[il] = xrts.values[:, None]
            track_names.append(f'{lag}..0..{precur.name}_sp')

        # concatenate the timeseries of all lags
        tsCorr = np.concatenate(tuple(ts_list), axis=1)

        dates = pd.to_datetime(precur_arr.time.values)
        ts_sp[s] = pd.DataFrame(tsCorr, index=dates, columns=track_names)
    # df_sp = pd.concat(list(ts_sp), keys=range(splits.size))
    return ts_sp
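The projection itself is delegated to find_precursors.calc_spatcov, which is not shown here. As a hedged reading aid only (not the project's implementation), the sketch below computes a plain, unweighted spatial covariance between a 2D pattern and every timestep of a (time, latitude, longitude) field; the real calc_spatcov may differ, e.g. in area weighting or normalisation.

import numpy as np
import xarray as xr

def spatcov_sketch(field: xr.DataArray, pattern: xr.DataArray) -> xr.DataArray:
    # field: (time, latitude, longitude); pattern: (latitude, longitude) with
    # NaNs outside the precursor regions
    mask = np.isfinite(pattern.values)
    pat = pattern.values[mask]
    pat_anom = pat - pat.mean()
    ts = np.empty(field.time.size)
    for i in range(field.time.size):
        fld = field.values[i][mask]
        # covariance over space between the field anomaly and the pattern
        ts[i] = np.mean((fld - fld.mean()) * pat_anom)
    return xr.DataArray(ts, coords={'time': field.time}, dims=['time'])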
Example #3
def percentile_cluster(var_filename,
                       xrclust,
                       q=75,
                       tailmean=True,
                       selbox=None):
    xarray = core_pp.import_ds_lazy(var_filename, selbox=selbox)
    labels = xrclust.values
    nparray = xarray.values
    n_t = xarray.time.size
    track_names = []
    area_grid = find_precursors.get_area(xarray)
    regions_for_ts = list(np.unique(labels[~np.isnan(labels)]))

    if tailmean:
        tmp_wgts = (area_grid / area_grid.mean())[:, :]
        a_wghts = np.tile(tmp_wgts[None, :], (n_t, 1, 1))
    else:
        a_wghts = area_grid / area_grid.mean()
    # this array will be the time series for each feature
    ts_clusters = np.zeros((xarray.shape[0], len(regions_for_ts)))

    # calculate the area-weighted percentile (or upper-tail mean) per label
    for r in regions_for_ts:
        track_names.append(int(r))
        idx = regions_for_ts.index(r)
        # start with empty lonlat array
        B = np.zeros(xrclust.shape)
        # Mask everything except region of interest
        B[labels == r] = 1
        # compute the regional time series for this cluster
        if not tailmean:
            # area-weighted spatial percentile for each timestep
            ts_clusters[:, idx] = np.nanpercentile(nparray[:, B == 1] *
                                                   a_wghts[B == 1],
                                                   q=q,
                                                   axis=1)
        else:
            # calc the spatial percentile for each timestep; note that we will
            # have a time-varying spatial mask.
            ts_clusters[:, idx] = np.nanpercentile(nparray[:, B == 1],
                                                   q=q,
                                                   axis=1)
            # take a mean over all gridpoints that exceed the percentile instead
            # of taking the single percentile value of the spatial region
            mask_B_perc = nparray[:, B == 1] > ts_clusters[:, idx, None]
            # The number of gridcells that exceed the percentile value is not
            # guaranteed to be the same at every timestep. When it differs, the
            # masked values can no longer be reshaped to a (time, space) array,
            # and the spatial mean per timestep cannot be taken in one go.
            # check whether the mask size is the same at every timestep
            cs_ = [int(mask_B_perc[t].sum()) for t in range(n_t)]
            if np.unique(cs_).size != 1:
                # pick the most common mask size across timesteps
                uniq_sizes = np.unique(cs_)
                common_shape = int(uniq_sizes[np.argmax(
                    [cs_.count(v) for v in uniq_sizes])])

                # make every mask the same size by flipping the first False
                # (to grow) or the first True (to shrink) until sizes match
                for t in range(n_t):
                    while mask_B_perc[t].sum() < common_shape:
                        mask_B_perc[t][np.argwhere(~mask_B_perc[t])[0][0]] = True
                    while mask_B_perc[t].sum() > common_shape:
                        mask_B_perc[t][np.argwhere(mask_B_perc[t])[0][0]] = False

            nptimespacefull = nparray[:, B == 1].reshape(nparray.shape[0], -1)
            npuppertail = nptimespacefull[mask_B_perc]
            wghtsuppertail = a_wghts[:, B == 1][mask_B_perc]

            y = np.nanmean(npuppertail.reshape(n_t, -1)
                           * wghtsuppertail.reshape(n_t, -1), axis=1)

            ts_clusters[:, idx] = y
    xrts = xr.DataArray(ts_clusters.T,
                        coords={
                            'cluster': track_names,
                            'time': xarray.time
                        },
                        dims=['cluster', 'time'])
    return xrts