Example #1
File: fit.py  Project: 1895-art/ninolearn
def cross_training(model, pipeline, n_iter, **kwargs):
    """
    Train the model on different training sets, in each of which the period\
    corresponding to one decade out of 1962-1971, 1972-1981, ..., 2012-last\
    observed date is left out for testing.

    :param model: A model that follows the guidelines for how a model object\
    should be set up.

    :param pipeline: A function that takes the lead time as argument and\
    returns the corresponding feature, label, time and persistence data.

    :param n_iter: The number of iterations of the randomized\
    hyperparameter search.

    :param **kwargs: Keyword arguments that are passed to the .set_parameter()\
    method of the provided model.
    """

    for lead_time in lead_times:
        X, y, timey = pipeline(lead_time, return_persistance=False)

        print_header(f'Lead time: {lead_time} months')

        for j in range(n_decades-1):
            m = model(**kwargs)
            dir_name = f"{m.hyperparameters['name']}_decade{decades[j]}_lead{lead_time}"
            path = join(modeldir, dir_name)

            n_files = 0
            if exists(path):
                n_files = len(listdir(path))

            if not exists(path) or n_files == 0:
                small_print_header(
                    f'Test period: {decades[j]}-01-01 till {decades[j+1]-1}-12-01')

                test_indeces = (timey >= f'{decades[j]}-01-01') & (
                    timey <= f'{decades[j+1]-1}-12-01')
                train_indeces = np.invert(test_indeces)
                trainX, trainy, traintime = X[train_indeces, :], y[train_indeces], timey[train_indeces]

                m.fit_RandomizedSearch(trainX, trainy, traintime, n_iter=n_iter)
                m.save(location=modeldir, dir_name=dir_name)

            else:
                print(f'{dir_name} already exists')
            del m
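
cross_training() above relies on module-level globals (lead_times, decades, n_decades, modeldir) and helpers defined elsewhere in fit.py. A minimal sketch of the setup it assumes; every concrete value here is hypothetical:

import numpy as np                 # cross_training uses np.invert
from os import listdir
from os.path import join, exists

from ninolearn.utils import print_header  # small_print_header is assumed to live here too

modeldir = '/path/to/models'                           # hypothetical save location
lead_times = [0, 3, 6, 9, 12, 15]                      # forecast lead times in months
decades = [1962, 1972, 1982, 1992, 2002, 2012, 2022]   # decade boundaries from the docstring
n_decades = len(decades)

# e.g. cross_training(DEM, pipeline, n_iter=50) plus model-specific kwargs
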
Example #2
from data_pipeline import pipeline

import os
import time
plt.close("all")
K.clear_session()

#%% =============================================================================
# Deep ensemble
# =============================================================================
decades = [60, 70, 80, 90, 100, 110]

for lead_time in [0, 3, 6, 9, 12, 15]:
    X, y, timey, yp = pipeline(lead_time, return_persistance=True)
    print_header(f'Lead time: {lead_time} month')

    for decade in decades:
        small_print_header(
            f'Test period: {1902+decade}-01-01 till {1911+decade}-12-01')

        # skip this loop iteration if the ensemble was already trained
        ens_dir = f'ensemble_decade{decade}_lead{lead_time}'
        out_dir = os.path.join(modeldir, ens_dir)

        if os.path.exists(out_dir):
            modified_time = time.gmtime(os.path.getmtime(out_dir))
            compare_time = time.strptime("21-7-2019 13:00 UTC",
                                         "%d-%m-%Y %H:%M %Z")

            if modified_time > compare_time:
                print("Trained already!")
Example #3
def cross_hindcast(model, pipeline, model_name):
    """
    Generate a hindcast from 1962 till today using the models which were
    trained by the .cross_training() method.

    :param model: The considered model.

    :param pipeline: The data pipeline that already was used before in \
    .cross_training().
    """

    first_lead_loop = True

    for i in range(n_lead):
        lead_time = lead_times[i]
        print_header(f'Lead time: {lead_time} months')

        X, y, timey, y_persistance = pipeline(lead_time,
                                              return_persistance=True)

        ytrue = np.array([])
        timeytrue = pd.DatetimeIndex([])

        first_dec_loop = True
        for j in range(n_decades - 1):
            small_print_header(
                f'Predict: {decades[j]}-01-01 till {decades[j+1]-1}-12-01')

            # test indices
            test_indeces = (timey >= f'{decades[j]}-01-01') & (
                timey <= f'{decades[j+1]-1}-12-01')
            testX, testy, testtimey = X[
                test_indeces, :], y[test_indeces], timey[test_indeces]

            m = model()
            m.load(location=modeldir,
                   dir_name=f'{model_name}_decade{decades[j]}_lead{lead_time}')

            # allocate arrays and variables for which the model must be loaded
            if first_dec_loop:
                n_outputs = m.n_outputs
                output_names = m.output_names
                pred_full = np.zeros((n_outputs, 0))
                first_dec_loop = False

            # make prediction
            pred = np.zeros((m.n_outputs, testX.shape[0]))
            pred[:, :] = m.predict(testX)

            # make the full time series
            pred_full = np.append(pred_full, pred, axis=1)
            ytrue = np.append(ytrue, testy)
            timeytrue = timeytrue.append(testtimey)
            del m

        if timeytrue[0] != pd.to_datetime('1963-01-01'):
            expected_first_date = '1963-01-01'
            got_first_date = timeytrue[0].isoformat()[:10]

            raise Exception(
                f"The first predicted date for lead time {lead_time} is "
                f"{got_first_date} but expected {expected_first_date}")

        # allocate arrays and variables for which the full length of the time
        # series must be known
        if first_lead_loop:
            n_time = len(timeytrue)
            pred_save = np.zeros((n_outputs, n_time, n_lead))
            first_lead_loop = False

        pred_save[:, :, i] = pred_full

    # Save data to a netcdf file
    save_dict = {}
    for i in range(n_outputs):
        save_dict[output_names[i]] = (['target_season',
                                       'lead'], pred_save[i, :, :])

    ds = xr.Dataset(save_dict,
                    coords={
                        'target_season': timeytrue,
                        'lead': lead_times
                    })
    ds.to_netcdf(join(processeddir, f'{model_name}_forecasts.nc'))
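
cross_hindcast() writes one netcdf file per model, dimensioned (target_season, lead), with one variable per model output. A short sketch of reading such a file back; the directory and the output name 'mean' are assumptions, not confirmed by the code above:

import xarray as xr
from os.path import join

processeddir = '/path/to/processed'  # hypothetical; the project defines this in its own paths module
ds = xr.open_dataset(join(processeddir, 'dem_forecasts.nc'))  # 'dem' is a hypothetical model_name
print(ds.data_vars)                  # the variables carry the model's output_names
# pred = ds['mean'].sel(lead=3)      # hypothetical output name 'mean', 3-month lead
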
Example #4
                     lr=[0.0001, 0.01],
                     batch_size=100,
                     epochs=500,
                     n_segments=5,
                     n_members_segment=1,
                     patience=30,
                     verbose=0,
                     pdf='normal')

model.fit_RandomizedSearch(trainX, trainy, n_iter=100)

#%%
pred_mean, pred_std = model.predict(testX)

loss = model.evaluate(testy, pred_mean, pred_std)
print_header(f"Loss: {loss}")

if model.pdf == "normal":
    ens_dir = f'ensemble_lead{lead_time}'
elif model.pdf == "skewed":
    ens_dir = f'ensemble_skew_lead{lead_time}'
else:
    ens_dir = f'simple_ensemble_lead{lead_time}'

model.save(location=modeldir, dir_name=ens_dir)

#%% =============================================================================
# Plots
# =============================================================================
plt.close("all")
Example #5
The downloaded data needs to be prepared so that all data sets share a similar time axis.

All spatial data is regridded to the 2.5x2.5 grid of the NCEP
reanalysis data.

Some variables are computed, i.e., the wind stress field, the wind speed
and the warm pool edge.
"""
import numpy as np

from ninolearn.utils import print_header
from ninolearn.preprocess.prepare import prep_oni, prep_nino_month, prep_wwv
from ninolearn.preprocess.prepare import prep_iod, prep_K_index, prep_wwv_proxy
from ninolearn.preprocess.prepare import calc_warm_pool_edge, prep_other_forecasts

print_header("Prepare Data")

# =============================================================================
# Prepare the indices
# =============================================================================
prep_oni()
prep_nino_month(index="3.4")
prep_nino_month(index="3")
prep_nino_month(index="1+2")
prep_nino_month(index="4")
prep_wwv()
prep_wwv(cardinal_direction="west")
prep_iod()
prep_K_index()
prep_wwv_proxy()
Example #6
from ninolearn.utils import print_header
from ninolearn.preprocess.network import networkMetricsSeries

print_header("Network Metrics")

nms_ssh_godas = networkMetricsSeries('sshg',
                                     'GODAS',
                                     processed="anom",
                                     threshold=0.9,
                                     startyear=1980,
                                     endyear=2018,
                                     window_size=12,
                                     lon_min=120,
                                     lon_max=280,
                                     lat_min=-30,
                                     lat_max=30,
                                     verbose=1)
nms_ssh_godas.computeTimeSeries()

nms_ssh_oras4 = networkMetricsSeries('zos',
                                     'ORAS4',
                                     processed="anom",
                                     threshold=0.9,
                                     startyear=1959,
                                     endyear=2017,
                                     window_size=12,
                                     lon_min=120,
                                     lon_max=280,
                                     lat_min=-30,
                                     lat_max=30,
                                     verbose=1)
Example #7
    def fit_RandomizedSearch(self, trainX, trainy, n_iter=10, **kwargs):
        """
        Hyperparameter optimization using random search.
        :type trainX: np.ndarray
        :param trainX: The training feature set. 2-D array with dimensions\
        (timesteps, features).

        :type trainy: np.ndarray
        :param trainy: The training label set. 2-D array with dimensions\
        (timesteps, labels).

        :param n_iter: The number of random search iterations.

        :param kwargs: Keyword arguments that are passed to the .fit() method.
        """
        # check if hyperparameter ranges were provided for the randomized search
        if len(self.hyperparameters_search) == 0:
            raise Exception("No variable indicated for hyperparameter search!")

        # iterate with randomized hyperparameters
        best_loss = np.inf
        for i in range(n_iter):
            print_header(f"Search iteration Nr {i+1}/{n_iter}")

            # random selection of hyperparameters
            for key in self.hyperparameters_search.keys():
                low = self.hyperparameters_search[key][0]
                high = self.hyperparameters_search[key][1]

                if type(low) is float and type(high) is float:
                    self.hyperparameters[key] = np.random.uniform(low, high)

                if type(low) is int and type(high) is int:
                    self.hyperparameters[key] = np.random.randint(low, high+1)

                if type(low) is tuple and type(high) is tuple:
                    hyp_list = []
                    for k in range(len(low)):
                        hyp_list.append(np.random.randint(low[k], high[k]+1))
                    self.hyperparameters[key] = tuple(hyp_list)

            self.fit(trainX, trainy, **kwargs)

            # check if validation score was enhanced
            if self.mean_val_loss < best_loss:
                best_loss = self.mean_val_loss
                self.best_hyperparameters = self.hyperparameters.copy()

                small_print_header("New best hyperparameters")
                print(f"Mean loss: {best_loss}")
                print(self.best_hyperparameters)

        # refit the model with the optimized hyperparameters
        # so that the DE weights correspond to the best hyperparameters again
        print_header("Refit the model with best hyperparamters")

        self.hyperparameters = self.best_hyperparameters.copy()
        print(self.hyperparameters)
        self.fit(trainX, trainy, **kwargs)

        print(f"best loss search: {best_loss}")
        print(f"loss refitting : {self.mean_val_loss}")
Example #8
                     lr=0.001,
                     batch_size=1,
                     epochs=500,
                     n_segments=5,
                     n_members_segment=1,
                     patience=30,
                     verbose=1,
                     std=True)

model.fit(trainX, trainy, testX, testy, use_pretrained=False)

#%%
pred_mean, pred_std = model.predict(testX)

score = model.evaluate(testy, pred_mean, pred_std)
print_header(f"Score: {score}")

if model.std:
    ens_dir = f'ensemble_lead{lead_time}'
else:
    ens_dir = f'simple_ensemble_lead{lead_time}'

model.save(location=modeldir, dir_name=ens_dir)

#%% =============================================================================
# Plots
# =============================================================================
plt.close("all")

# Scores during training
plt.subplots()
Example #9
# preprocess data
feature_unscaled = feature.values.reshape(feature.shape[0], -1)
label_unscaled = label.values.reshape(label.shape[0], -1)

scaler_f = StandardScaler()
Xorg = scaler_f.fit_transform(feature_unscaled)

scaler_l = StandardScaler()
yorg = scaler_l.fit_transform(label_unscaled)

Xall = np.nan_to_num(Xorg)
yall = np.nan_to_num(yorg)

shift = 3
lead = 9
print_header(f'Lead time: {lead} months')

y = yall[lead + shift:]
X = Xall[:-lead - shift]

timey = oni.index[lead + shift:]

y_nino = oni[lead + shift:]

pred_full_oni = np.array([])
true_oni = np.array([])
timeytrue = pd.DatetimeIndex([])

pred_da_full = xr.zeros_like(label[lead + shift:, :, :])

for j in range(n_decades):
Example #10
"""
The following script downloads all the data that was relevant for my master's thesis.
"""

from ninolearn.download import download, sources
from ninolearn.utils import print_header

print_header("Download Data")

#%%
# =============================================================================
# Single files
# =============================================================================
download(sources.SST_ERSSTv5)
download(sources.ONI)
download(sources.NINOindeces)
download(sources.IOD)
download(sources.HCA)
download(sources.OLR_NOAA)
download(sources.WWV)
download(sources.WWV_West)
download(sources.UWIND_NCEP)
download(sources.VWIND_NCEP)
download(sources.SAT_monthly_NCEP)
download(sources.otherForecasts)

# =============================================================================
# Multiple files
# =============================================================================
for i in range(1958, 2018):
Example #11
import numpy as np

from ninolearn.download import downloadFileFTP, downloadFileHTTP, download
from ninolearn.private import CMEMS_password, CMEMS_username
from ninolearn.utils import print_header
from ninolearn.sources import SST_ERSSTv5
# =============================================================================
# Download
# =============================================================================
print_header("Download Data")


download(SST_ERSSTv5)


#%% =============================================================================
# ERSSTv5
# =============================================================================
ERSSTv5_dict = {
        'filename': 'sst.mnmean.nc',
        'host': 'ftp.cdc.noaa.gov',
        'location': '/Datasets/noaa.ersst.v5/'
        }

downloadFileFTP(ERSSTv5_dict)

# =============================================================================
# NINO3.4 Index
# =============================================================================
Example #12
Xorg = np.load(join(infodir,'Xorg.npy'))
# include values of 3 and 6 months previously
n_lags = 3
step = 3
X = include_time_lag(Xorg, n_lags = n_lags, step=step)
X = X[-1:,:] # now use only the latest observation to produce forecast


# =============================================================================
# For each lead time, load ensemble of models and make prediction
# =============================================================================

lead_times = np.load(join(infodir,'lead_times.npy'))
predictions = np.zeros((2,len(lead_times))) # first row: mean, second row: std

print_header("Making predictions")

for i in range(len(lead_times)):
    print(f"Lead time {lead_times[i]} months")
    dem = DEM(layers=1, neurons = 32, dropout=0.05, noise_in=0.0, noise_sigma=0.,
                       noise_mu=0., l1_hidden=0.0, l2_hidden=0.,
                       l1_mu=0, l2_mu=0., l1_sigma=0,
                       l2_sigma=0.0, lr=0.01, batch_size=100,
                       epochs=5000, n_segments=5, n_members_segment=3, patience=25,
                       activation='tanh',
                       verbose=0, pdf="normal", name="gdnn_ex_pca")
    for j in decades[:-1]:
        dem.load(location=modeldir,
                 dir_name=f'gdnn_ex_pca_decade{j}_lead{lead_times[i]}')
    pred = dem.predict(X)
    predictions[0,i] = pred[0][0] # mean
    predictions[1,i] = pred[1][0] # std
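
The predictions array above holds the ensemble mean in row 0 and the standard deviation in row 1, one column per lead time. A minimal plotting sketch (the styling and the index label are assumptions):

import matplotlib.pyplot as plt

plt.figure()
plt.errorbar(lead_times, predictions[0], yerr=predictions[1], marker='o', capsize=3)
plt.xlabel('lead time (months)')
plt.ylabel('predicted index')  # presumably the ONI, as elsewhere in these examples
plt.show()
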
Example #13
label_unscaled = label.values.reshape(label.shape[0], -1)


scaler_f = StandardScaler()
Xorg = scaler_f.fit_transform(feature_unscaled)

scaler_l = StandardScaler()
yorg = scaler_l.fit_transform(label_unscaled)

Xall = np.nan_to_num(Xorg)
yall = np.nan_to_num(yorg)

# shift
shift = 3
for lead in [3, 6, 9, 12, 15, 0]:
    print_header(f'Lead time: {lead} months')

    y = yall[lead+shift:]
    X = Xall[:-lead-shift]
    timey = oni.index[lead+shift:]

    for decade in [60, 70, 80, 90, 100, 110]:
        print_header(f'Test period: {1902+decade}-01-01 till {1911+decade}-12-01')
        K.clear_session()

        # skip this loop iteration if the model was already trained
        ens_dir = f'ed_ensemble_decade{decade}_lead{lead}'
        out_dir = os.path.join(ed_model_dir, ens_dir)

        modified_time = time.gmtime(os.path.getmtime(out_dir))
        compare_time = time.strptime("15-7-2019 13:00 UTC", "%d-%m-%Y %H:%M %Z")
Example #14
def cross_training(model, pipeline, n_iter, lead_times, **kwargs):
    """
    Train the model on different training sets, in each of which the period\
    corresponding to one decade out of 1962-1971, 1972-1981, ..., 2012-last\
    observed date is left out for testing.

    :param model: A model that follows the guidelines for how a model object\
    should be set up.

    :param pipeline: A function that takes the lead time as argument and\
    returns the corresponding feature, label, time and persistence data.

    :param n_iter: The number of iterations of the randomized\
    hyperparameter search.

    :param lead_times: The lead times (in months) for which models\
    are trained.

    :param **kwargs: Keyword arguments that are passed to the .set_parameter()\
    method of the provided model.
    """

    for lead_time in lead_times:
        X, y, timey = pipeline(lead_time, return_persistance=False)

        print_header(f'Lead time: {lead_time} months')

        for j in range(n_decades - 1):
            m = model(**kwargs)
            dir_name = f"{m.hyperparameters['name']}_decade{decades[j]}_lead{lead_time}"
            path = join(modeldir, dir_name)

            n_files = 0
            if exists(path):
                n_files = len(listdir(path))

            if not exists(path) or n_files == 0:
                small_print_header(
                    f'Test period: {decades[j]}-01-01 till {decades[j+1]-1}-12-01'
                )

                test_indeces = (timey >= f'{decades[j]}-01-01') & (
                    timey <= f'{decades[j+1]-1}-12-01')
                train_indeces = np.invert(test_indeces)
                trainX, trainy, traintime = X[
                    train_indeces, :], y[train_indeces], timey[train_indeces]

                m.fit_RandomizedSearch(trainX,
                                       trainy,
                                       traintime,
                                       n_iter=n_iter)
                m.save(location=modeldir, dir_name=dir_name)

            else:
                print(f'{dir_name} already exists')
            del m



# def cross_hindcast_dem(model, pipeline, model_name):
#     """
#     Generate a hindcast from 1962 till today using the models which were
#     trained by the .cross_training() method. ONLY works for the DEM.
#     This routine returns an std estimate that is only based on the correlation
#     skill of the DEM predicted mean.

#     :param model: The considered model.

#     :param pipeline: The data pipeline that already was used before in \
#     .cross_training().
#     """
#     #cross_hindcast(model, pipeline, model_name)

#     std_estimate = xr.open_dataarray(join(processeddir, f'{model_name}_std_estimate.nc'))

#     first_lead_loop = True

#     for i in range(n_lead):
#         lead_time = lead_times[i]
#         print_header(f'Lead time: {lead_time} months')

#         X, y, timey, y_persistance = pipeline(lead_time, return_persistance=True)

#         ytrue = np.array([])
#         timeytrue = pd.DatetimeIndex([])

#         first_dec_loop = True
#         for j in range(n_decades-1):
#             small_print_header(f'Predict: {decades[j]}-01-01 till {decades[j+1]-1}-12-01')

#             # test indices
#             test_indeces = (timey>=f'{decades[j]}-01-01') & (timey<=f'{decades[j+1]-1}-12-01')
#             testX, testy, testtimey = X[test_indeces,:], y[test_indeces], timey[test_indeces]

#             m = model()
#             m.load(location=modeldir, dir_name=f'{model_name}_decade{decades[j]}_lead{lead_time}')

#             # allocate arrays and variables for which the model must be loaded
#             if first_dec_loop:
#                 n_outputs = m.n_outputs
#                 output_names = m.output_names
#                 pred_full = np.zeros((n_outputs+1, 0))
#                 first_dec_loop=False

#             # make prediction
#             pred = np.zeros((m.n_outputs+1, testX.shape[0]))
#             pred[:2,:] = m.predict(testX)

#             for k in range(len(testtimey)):
#                 month = testtimey[k].date().month
#                 pred[-1, k] = std_estimate[i, month-1]

#             # make the full time series
#             pred_full = np.append(pred_full, pred, axis=1)
#             ytrue = np.append(ytrue, testy)
#             timeytrue = timeytrue.append(testtimey)
#             del m

#         if timeytrue[0]!=pd.to_datetime('1963-01-01'):
#             expected_first_date = '1963-01-01'
#             got_first_date = timeytrue[0].isoformat()[:10]

#             raise Exception(f"The first predicted date for lead time {lead_time} "
#                             f"is {got_first_date} but expected {expected_first_date}")

#         # allocate arrays and variables for which the full length of the time
#         # series must be known
#         if first_lead_loop:
#             n_time = len(timeytrue)
#             pred_save =  np.zeros((n_outputs+1, n_time, n_lead))
#             first_lead_loop=False

#         pred_save[:,:,i] =  pred_full

#     # Save data to a netcdf file
#     save_dict = {}
#     for i in range(n_outputs + 1):
#         if i<n_outputs:
#             save_dict[output_names[i]] = (['target_season', 'lead'],  pred_save[i,:,:])
#         else:
#             save_dict['std_estimate'] = (['target_season', 'lead'],  pred_save[i,:,:])

#     ds = xr.Dataset(save_dict, coords={'target_season': timeytrue,
#                                        'lead': lead_times} )
#     ds.to_netcdf(join(processeddir, f'{model_name}_forecasts_with_std_estimated.nc'))
#     ds.close()