def eof_correlation(eof_filepath, mask_filepath):
    """ Returns plot and DataArray of areas with p<0.05."""

    print("processing precipitation")
    da = era5.download_data(mask_filepath, xarray=True)
    tp_ds = da.mean(dim=["latitude", "longitude"]).tp
    tp = tp_ds.assign_coords(time=(tp_ds.time.astype("datetime64")))
    tp_df = tp.to_dataframe()

    print("processing EOF")
    eof_da = xr.open_dataset(eof_filepath)
    eof_ds = eof_da.EOF
    eof = eof_ds.assign_coords(time=(eof_ds.time.astype("datetime64")))
    eof_df = eof.to_dataframe()
    eof_pv = pd.pivot_table(eof_df,
                            values="EOF",
                            index=["time"],
                            columns=["latitude", "longitude"])
    eof_reset = eof_pv.reset_index()
    eof_reset["time"] -= np.timedelta64(12, "h")

    print("combining")
    df_combined = pd.merge_ordered(tp_df, eof_reset, on="time")
    df_clean = df_combined.dropna()

    corr_s = df_clean.corrwith(df_clean["tp"])
    corr_df = corr_s.to_frame(name="corr")
    corr_df["pvalue"] = pvalue(df_clean)

    filepath = "_Data/EOF_corr_pval.csv"
    corr_df.to_csv(filepath)

    return filepath
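# Hedged usage sketch for eof_correlation: the EOF NetCDF path is an
# assumption for illustration only; the mask path is the project mask used
# later in this module. The saved CSV can then be filtered for p < 0.05.
def _example_eof_correlation():
    csv_path = eof_correlation("_Data/EOF.nc",
                               "_Data/Masks/ERA5_Upper_Indus_mask.nc")
    corr_df = pd.read_csv(csv_path, index_col=0)
    return corr_df[corr_df["pvalue"] < 0.05]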
def spatial_autocorr(variable, mask_filepath):  # TODO
    """ Plots spatial autocorrelation """

    df = era5.download_data(mask_filepath)
    # pivot to a (time x grid cell) table and detrend each cell's timeseries
    table = pd.pivot_table(df,
                           values=variable,
                           index=["latitude", "longitude"],
                           columns=["time"])
    trans_table = table.T
    detrended = pd.DataFrame(detrend(trans_table.values, axis=0),
                             index=trans_table.index,
                             columns=trans_table.columns)
    corr_table = detrended.corr()
    print(corr_table)

    corr_khyber = corr_table.loc[(34.5, 73.0)]
    corr_gilgit = corr_table.loc[(36.0, 75.0)]
    corr_ngari = corr_table.loc[(33.5, 79.0)]

    corr_list = [corr_khyber, corr_gilgit, corr_ngari]

    for corr in corr_list:

        df_reset = corr.reset_index().droplevel(1, axis=1)
        df_pv = df_reset.pivot(index="latitude", columns="longitude")
        df_pv = df_pv.droplevel(0, axis=1)
        da = xr.DataArray(data=df_pv, name="Correlation")

        # Plot

        plt.figure()
        ax = plt.subplot(projection=ccrs.PlateCarree())
        ax.set_extent([71, 83, 30, 38])
        da.plot(
            x="longitude",
            y="latitude",
            add_colorbar=True,
            ax=ax,
            vmin=-1,
            vmax=1,
            cmap="coolwarm",
            cbar_kwargs={"pad": 0.10},
        )
        ax.gridlines(draw_labels=True)
        ax.set_xlabel("Longitude")
        ax.set_ylabel("Latitude")

    plt.show()
def random_location_generator(location, N=50):
    """ Returns DataFrame of random location, apply to clean df only """

    coord_list = []

    df = era5.download_data(location)
    df_coords = df[["lat", "lon"]].reset_index()
    df_unique = df_coords.drop_duplicates(subset=["lat", "lon"])

    indices = np.random.randint(len(df_unique), size=N)

    for i in indices:
        df_location = df_unique.iloc[i]
        lat = df_location["lat"]
        lon = df_location["lon"]
        coord_list.append([lat, lon])

    return coord_list
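# Hedged usage sketch for random_location_generator: 'uib' is assumed to be a
# valid location string for era5.download_data, matching the default used by
# tp_vs below. Each returned entry is a [lat, lon] pair.
def _example_random_locations():
    coords = random_location_generator("uib", N=5)
    for lat, lon in coords:
        print(str(lat) + "°N, " + str(lon) + "°E")
    return coords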
def averaged_timeseries(mask_filepath,
                        variable="tp",
                        longname="Total precipitation [m/day]"):
    """ Timeseries for the Upper Indus Basin"""

    df = era5.download_data(mask_filepath)

    df_var = df[["time", variable]]

    df_var["time"] = df_var["time"].astype(np.datetime64)
    df_mean = df_var.groupby("time").mean()

    df_mean.plot()
    plt.title("Upper Indus Basin")
    plt.ylabel(longname)
    plt.xlabel("Year")
    plt.grid(True)

    plt.show()
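# Hedged usage sketch for averaged_timeseries: plots basin-averaged 2m
# dewpoint temperature instead of the default precipitation. The mask path is
# the project mask defined later in this module; the longname is an assumption.
def _example_averaged_timeseries():
    averaged_timeseries("_Data/Masks/ERA5_Upper_Indus_mask.nc",
                        variable="d2m",
                        longname="2m dewpoint temperature [K]")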
def tp_vs(variable, longname="", location='uib', time=False):
    """ Scatter plot of total precipitation against another ERA5 variable,
    either over all grid cells in a basin (location string) or at a single
    [lat, lon] coordinate."""

    ds = era5.download_data(location, xarray=True)

    if type(location) is str:
        mask_filepath = dp.find_mask(location)
        masked_ds = location_sel.apply_mask(ds, mask_filepath)
    else:
        masked_ds = ds.interp(coords={
            "lon": location[1],
            "lat": location[0]
        },
                              method="nearest")

    if type(time) == str:
        masked_ds = masked_ds.isel(time=-6)

    multiindex_df = masked_ds.to_dataframe()
    df_clean = multiindex_df.dropna().reset_index()

    df = df_clean[["tp", variable]]
    df_var = df.dropna()

    # Plot
    df_var.plot.scatter(x=variable, y="tp", alpha=0.2, c="b")
    if type(location) is str:
        plt.title(location)
    else:
        plt.title(str(location[0]) + '°N, ' + str(location[1]) + '°E')
    plt.ylabel("Total precipitation [mm/day]")
    plt.xlabel(longname)
    plt.grid(True)
    plt.show()
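# Hedged usage sketch for tp_vs: scatter total precipitation against total
# column water vapour at the Gilgit grid point used in spatial_autocorr above.
def _example_tp_vs():
    tp_vs("tcwv",
          longname="Total column water vapour [kg/m2]",
          location=[36.0, 75.0])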
def input_correlation_heatmap():
    """Plot correlation heatmap for model inputs."""

    df = era5.download_data(mask_filepath, all_var=True)

    # create lags
    df["N34-1"] = df["N34"].shift(periods=393)
    df["NAO-1"] = df["NAO"].shift(periods=393)
    df["N4-1"] = df["N4"].shift(periods=393)

    df = df.drop(columns=["time"])
    df_clean = df.dropna()
    df_sorted = df_clean.sort_index(axis=1)
    corr = df_sorted.corr()

    sns.set(style="white")
    plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    mask = np.triu(np.ones_like(
        corr, dtype=bool))  # generate a mask for the upper triangle
    sns.heatmap(
        corr,
        mask=mask,
        cmap=cmap,
        center=0,
        vmin=-1,
        vmax=1,
        fmt="0.2f",
        square=True,
        linewidths=0.5,
        annot=True,
        annot_kws={"size": 5},
        cbar_kws={"shrink": 0.5},
    )
    plt.title("Correlation plot for Upper Indus Basin")

    plt.show()
def annual_map(location, variable, year, cumulative=False):
    """ Annual map """

    da = era5.download_data(location, xarray=True)
    ds_year = da.sel(time=slice(
        str(year) + "-01-16T12:00:00",
        str(year + 1) + "-01-01T12:00:00"))
    ds_var = ds_year[variable] * 1000  # to mm/day

    if cumulative is True:
        ds_processed = cumulative_monthly(ds_var)
        ds_final = ds_processed.sum(dim="time")
    else:
        ds_final = ds_var.std(dim="time")  # TODO weighted mean

    print(ds_final)

    plt.figure()
    ax = plt.subplot(projection=ccrs.PlateCarree())
    ax.set_extent([71, 83, 30, 38])
    g = ds_final["tp_0001"].plot(
        cmap="magma_r",
        vmin=0.001,
        cbar_kwargs={
            "label": "Precipitation standard deviation [mm/day]",
            "extend": "neither",
            "pad": 0.10
        })
    g.cmap.set_under("white")
    # ax.add_feature(cf.BORDERS)
    ax.coastlines()
    ax.gridlines(draw_labels=True)
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    plt.title("Upper Indus Basin Total Precipitation " + str(year) + "\n \n")
    plt.show()
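# Hedged usage sketch for annual_map: cumulative precipitation over the basin
# for the year 2000; 'uib' is assumed to be a valid location string, matching
# the default used by tp_vs above.
def _example_annual_map():
    annual_map("uib", "tp", 2000, cumulative=True)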
def monthly_PDF(timeseries, variable="tp", longname=""):
    """ Plots the probability density of the given variable for each calendar
    month, marking the 95th percentile."""

    combined_df = pd.DataFrame()

    for ts in timeseries:
        df1 = ts.tp.to_dataframe(name=ts.plot_legend)
        df2 = df1.dropna().reset_index()
        df3 = df2.drop(["time", "lon", "lat"], axis=1)
        combined_df[ts.plot_legend] = df3[ts.plot_legend]

    df = era5.download_data(mask_filepath)
    clean_df = df.dropna()
    df_var = clean_df[["time", variable]]
    reduced_df = df_var.reset_index()
    reduced_df["time"] = (reduced_df["time"] -
                          np.floor(reduced_df["time"])) * 12

    grouped_dfs = []

    for m in np.arange(1, 13):
        month_df = reduced_df[reduced_df["time"] == m]
        grouped_dfs.append(month_df[variable])

    # PDF for each month
    """
    fig, axs = plt.subplots(4, 3, sharex=True, sharey=True)

    for i in range(12):
        x= int(i/3)
        y= i%3
        axs[x,y].hist(grouped_dfs[i], bins=50, density=True)
        axs[x,y].set_title(month_dict[i])
        axs[x,y].set_title(month_dict[i])
        axs[x,y].set_xlabel('Total precipation [m]')
        axs[x,y].set_ylabel('Probability density')
        axs[x,y].axvline(np.percentile(grouped_dfs[i], 95), color='k',
        linestyle='dashed', linewidth=1)
    """
    """
    fig, axs = plt.subplots(12, 1, sharex=True, sharey=True, figsize=(5, 50))

    for i in range(12):
        axs[i].hist(grouped_dfs[i], bins=50, density=True)
        axs[i].set_title(month_dict[i])
        axs[i].set_title(month_dict[i])
        axs[i].set_xlabel('Total precipation [m]')
        axs[i].set_ylabel('Probability density')
        axs[i].axvline(np.percentile(grouped_dfs[i], 95), color='k',
        linestyle='dashed', linewidth=1)
    """

    _fig, axs = plt.subplots(3, 4, sharex=True, sharey=True)

    for i in range(12):
        x = i % 3
        y = int(i / 3)
        axs[x, y].hist(grouped_dfs[i], density=True)
        axs[x, y].set_title(month_dict[i])
        axs[x, y].xaxis.set_tick_params(which="both", labelbottom=True)
        axs[x, y].yaxis.set_tick_params(which="both", labelbottom=True)
        axs[x, y].set_xlabel(longname)
        axs[x, y].set_ylabel("Probability density")
        axs[x, y].axvline(
            np.percentile(grouped_dfs[i], 95),
            color="k",
            linestyle="dashed",
            linewidth=1,
            label="95th percentile",
        )

    plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.30))

    plt.show()
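# Hedged usage sketch for monthly_PDF: passing an empty list skips the
# timeseries comparison step and plots the monthly ERA5 distributions only,
# using the module-level mask_filepath.
def _example_monthly_PDF():
    monthly_PDF([], variable="tp", longname="Total precipitation [m/day]")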
def point_model(location, number=None, EDA_average=False, maxyear=None):
    """
    Outputs test, validation and training data for total precipitation
    as a function of time, 2m dewpoint temperature, angle of sub-gridscale
    orography, orography, slope of sub-gridscale orography, total column
    water vapour, Nino 3.4, Nino 4 and NAO index for a single point.

    Inputs
        location: mask/basin name, string, or [latitude, longitude]
            coordinates, list of floats
        number, optional: specify desired ensemble run, integer
        EDA_average, optional: specify if you want average of low resolution
            ensemble runs, boolean
        maxyear, optional: last year to include, integer

    Outputs
        x_train, x_val, x_test: training, validation and testing feature
            vectors, numpy arrays
        y_train, y_val, y_test: training, validation and testing output
            vectors, numpy arrays
    """
    if number is not None:
        da_ensemble = era5.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")
    elif EDA_average is True:
        da_ensemble = era5.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    else:
        da = era5.download_data(location, xarray=True)

    if isinstance(location, str):
        multiindex_df = da.to_dataframe()
        df_clean = multiindex_df.dropna().reset_index()
        df_location = sa.random_location_sampler(df_clean)
        df = df_location.drop(columns=["lat", "lon", "slor", "anor", "z"])

    else:
        da_location = da.interp(coords={
            "lat": location[0],
            "lon": location[1]
        },
                                method="nearest")
        multiindex_df = da_location.to_dataframe()
        df_clean = multiindex_df.dropna().reset_index()
        df = df_clean.drop(columns=["lat", "lon", "slor", "anor", "z"])

    if maxyear is not None:
        df["time"] = df[df["time"] < maxyear]

    df["time"] = df["time"] - 1970  # to years
    df["tp"] = log_transform(df["tp"])
    df = df[["time", "d2m", "tcwv", "N34", "tp"]]  # format order

    # Keep first 70% of the period for training
    train_df = df[df["time"] < df["time"].max() * 0.7]
    xtrain = train_df.drop(columns=["tp"]).values
    ytrain = train_df["tp"].values

    # Last 30% for evaluation
    eval_df = df[df["time"] > df["time"].max() * 0.7]
    x_eval = eval_df.drop(columns=["tp"]).values
    y_eval = eval_df["tp"].values

    # Training and validation data
    xval, xtest, yval, ytest = train_test_split(x_eval,
                                                y_eval,
                                                test_size=0.3333,
                                                shuffle=False)

    return xtrain, xval, xtest, ytrain, yval, ytest
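# Hedged usage sketch for point_model: build the splits at the Gilgit grid
# point and check the array shapes; the feature columns are time, d2m, tcwv
# and N34, with log-transformed precipitation as the target.
def _example_point_model():
    xtrain, xval, xtest, ytrain, yval, ytest = point_model([36.0, 75.0])
    print(xtrain.shape, xval.shape, xtest.shape)
    print(ytrain.shape, yval.shape, ytest.shape)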
def areal_model(location,
                number=None,
                EDA_average=False,
                length=3000,
                seed=42,
                maxyear=None):
    """
    Outputs test, validation and training data for total precipitation as a
    function of time, 2m dewpoint temperature, angle of sub-gridscale
    orography, orography, slope of sub-gridscale orography, total column water
    vapour, Nino 3.4 index for given number randomly sampled data points
    for a given basin.

    Inputs
        location: specify area to train model
        number, optional: specify desired ensemble run, integer
        EDA_average, optional: specify if you want average of low resolution
            ensemble runs, boolean
        length, optional: specify number of points to sample, integer
        seed, optional: specify seed, integer
        maxyear, optional: last year to include, integer

    Outputs
        x_train, x_val, x_test: training, validation and testing feature
            vectors, numpy arrays
        y_train, y_val, y_test: training, validation and testing output
            vectors, numpy arrays
    """

    if number is not None:
        da_ensemble = era5.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")

    elif EDA_average is True:
        da_ensemble = era5.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    else:
        da = era5.download_data(location, xarray=True)

    # apply mask
    mask_filepath = location_sel.find_mask(location)
    masked_da = location_sel.apply_mask(da, mask_filepath)

    if maxyear is not None:
        masked_da = masked_da.where(da.time < maxyear + 1, drop=True)

    multiindex_df = masked_da.to_dataframe()
    df_clean = multiindex_df.dropna().reset_index()
    df = sa.random_location_and_time_sampler(df_clean,
                                             length=length,
                                             seed=seed)

    df["time"] = df["time"] - 1970
    df["tp"] = log_transform(df["tp"])
    df = df[[
        "time", "lat", "lon", "slor", "anor", "z", "d2m", "tcwv", "N34", "tp"
    ]]

    # Keep first 70% of the period for training
    train_df = df[df['time'] < df['time'].max() * 0.7]
    xtrain = train_df.drop(columns=['tp']).values
    ytrain = train_df['tp'].values

    # Last 30% for evaluation
    eval_df = df[df['time'] > df['time'].max() * 0.7]
    x_eval = eval_df.drop(columns=['tp']).values
    y_eval = eval_df['tp'].values

    # Training and validation data
    xval, xtest, yval, ytest = train_test_split(x_eval,
                                                y_eval,
                                                test_size=0.3333,
                                                shuffle=True)

    return xtrain, xval, xtest, ytrain, yval, ytest
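# Hedged usage sketch for areal_model: sample 3000 space-time points over the
# assumed 'uib' basin; features additionally include lat, lon and the static
# orography fields (slor, anor, z).
def _example_areal_model():
    xtrain, xval, xtest, ytrain, yval, ytest = areal_model("uib",
                                                           length=3000,
                                                           seed=42)
    print(xtrain.shape, ytrain.shape)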
def areal_model_eval(location,
                     number=None,
                     EDA_average=False,
                     length=3000,
                     seed=42,
                     minyear=1979,
                     maxyear=2020):
    """
    Returns data to evaluate an areal model at a given location, area and time
    period.

    Variables:
        Total precipitation as a function of time, 2m dewpoint
        temperature, angle of sub-gridscale orography, orography, slope of
        sub-gridscale orography, total column water vapour, Nino 3.4, Nino 4
        and NAO index for a single point.

    Inputs:
        location: mask/basin name, string, or [latitude, longitude]
            coordinates, list of floats
        number, optional: specify desired ensemble run, integer
        EDA_average, optional: specify if you want average of low resolution
            ensemble runs, boolean
        length, optional: number of points to sample for areal locations,
            integer
        seed, optional: specify seed, integer
        minyear, maxyear, optional: start and end years of the evaluation
            period, integers

    Outputs
        x_tr: evaluation feature vector, numpy array
        y_tr: evaluation output vector, numpy array
    """
    if number is not None:
        da_ensemble = era5.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")
    elif EDA_average is True:
        da_ensemble = era5.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    else:
        da = era5.download_data(location, xarray=True)

    sliced_da = da.sel(time=slice(minyear, maxyear))

    if isinstance(location, str):
        mask_filepath = location_sel.find_mask(location)
        masked_da = location_sel.apply_mask(sliced_da, mask_filepath)
        multiindex_df = masked_da.to_dataframe()
        df_clean = multiindex_df.dropna().reset_index()
        df_clean = multiindex_df.dropna().reset_index()
        df = sa.random_location_and_time_sampler(df_clean,
                                                 length=length,
                                                 seed=seed)

    else:
        da_location = sliced_da.interp(coords={
            "lat": location[0],
            "lon": location[1]
        },
                                       method="nearest")
        multiindex_df = da_location.to_dataframe()
        df = multiindex_df.dropna().reset_index()

    df["time"] = df["time"] - 1970  # to years
    df["tp"] = log_transform(df["tp"])
    df = df[[
        "time", "lat", "lon", "slor", "anor", "z", "d2m", "tcwv", "N34", "tp"
    ]]  # format order

    xtr = df.drop(columns=["tp"]).values
    ytr = df["tp"].values

    return xtr, ytr
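# Hedged usage sketch for areal_model_eval: pull held-out 2000-2010 samples
# for an assumed 'khyber' cluster name and score any fitted regressor with a
# predict method; `model` is a placeholder, not defined in this module.
def _example_areal_model_eval(model):
    xtr, ytr = areal_model_eval("khyber", minyear=2000, maxyear=2010)
    ypred = model.predict(xtr)
    return ypred, ytr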
def cluster_correlation_heatmap():
    """Plot correlation heatmap for the three clusters."""

    masks = ["Khyber_mask.nc", "Gilgit_mask.nc", "Ngari_mask.nc"]
    names = ["Gilgit regime", "Ngari regime", "Khyber regime"]

    for i in range(3):
        cluster_df = era5.download_data(masks[i])

        # create lags
        cluster_df["CGTI-1"] = cluster_df["CGTI"].shift(periods=1)
        cluster_df["CGTI-2"] = cluster_df["CGTI"].shift(periods=2)
        cluster_df["CGTI-3"] = cluster_df["CGTI"].shift(periods=3)
        cluster_df["CGTI-4"] = cluster_df["CGTI"].shift(periods=4)
        cluster_df["CGTI-5"] = cluster_df["CGTI"].shift(periods=5)
        cluster_df["CGTI-6"] = cluster_df["CGTI"].shift(periods=6)
        """'
        df_combined['N34-1'] = df_combined['N34'].shift(periods=1)
        df_combined['N34-2'] = df_combined['N34'].shift(periods=2)
        df_combined['N34-3'] = df_combined['N34'].shift(periods=3)
        df_combined['N34-4'] = df_combined['N34'].shift(periods=4)
        df_combined['N34-5'] = df_combined['N34'].shift(periods=5)
        df_combined['N34-6'] = df_combined['N34'].shift(periods=6)

        df_combined['N4-1'] = df_combined['N4'].shift(periods=1)
        df_combined['N4-2'] = df_combined['N4'].shift(periods=2)
        df_combined['N4-3'] = df_combined['N4'].shift(periods=3)
        df_combined['N4-4'] = df_combined['N4'].shift(periods=4)
        df_combined['N4-5'] = df_combined['N4'].shift(periods=5)
        df_combined['N4-6'] = df_combined['N4'].shift(periods=6)
        """
        df = cluster_df.drop(columns=["expver", "time"])
        df_clean = df.dropna()
        df_sorted = df_clean.sort_index(axis=1)

        # Correlation matrix
        corr = df_sorted.corr()

        # Plot
        sns.set(style="white")

        plt.subplots(figsize=(11, 9))
        cmap = sns.diverging_palette(220, 10, as_cmap=True)

        # Draw the heatmap with the mask and correct aspect ratio
        mask = np.triu(np.ones_like(
            corr, dtype=bool))  # generate a mask for the upper triangle
        sns.heatmap(
            corr,
            mask=mask,
            cmap=cmap,
            center=0,
            vmin=-1,
            vmax=1,
            square=True,
            linewidths=0.5,
            cbar_kws={"shrink": 0.5},
        )

        plt.title(names[i] + "\n")

    plt.show()
from maps.plot_data import cumulative_monthly

# Filepaths
mask_filepath = "_Data/Masks/ERA5_Upper_Indus_mask.nc"
dem_filepath = "_Data/elev.0.25-deg.nc"


# Function inputs

# Digital Elevation Model data
dem = xr.open_dataset(dem_filepath)
dem_da = dem["data"].sum(dim="time")
sliced_dem = dem_da.sel(lat=slice(38, 30), lon=slice(71.25, 82.75))

# Precipitation data
da = era5.download_data(mask_filepath, xarray=True)
tp_da = da.tp

# Decade list
decades = [1980, 1990, 2000, 2010]

# Cluster list
N = np.arange(2, 11, 1)


def seasonal_clusters(tp_da, sliced_dem, N, decades):
    """
    K-means clustering of precipitation data as a function of seasons, decades
    and number of clusters. Returns spatial graphs, overlayed with the local
    topography contours.