def cross_training(model, pipeline, n_iter, **kwargs):
    """
    Leave-one-period-out training of a model for every lead time.

    For each entry of the module-level ``lead_times`` the data are requested
    from the pipeline. Then, for each pair of consecutive entries of the
    module-level ``decades``, the samples whose time stamp lies within that
    period are withheld, a fresh model is tuned on all remaining samples with
    its ``.fit_RandomizedSearch()`` method and saved below ``modeldir``.
    A period is skipped if its save directory already exists and contains at
    least one file.

    :param model: A model class that follows the guidelines how a model
        object should be set up. It is instantiated as ``model(**kwargs)``
        and has to provide ``hyperparameters['name']``,
        ``.fit_RandomizedSearch()`` and ``.save()``.
    :param pipeline: A function that is called as
        ``pipeline(lead_time, return_persistance=False)`` and returns the
        corresponding feature, label and time.
    :param n_iter: Forwarded as ``n_iter`` to ``.fit_RandomizedSearch()``.
    :param **kwargs: Keyword arguments that are passed to the constructor of
        the provided model.
    """
    for lead_time in lead_times:
        X, y, timey = pipeline(lead_time, return_persistance=False)
        print_header(f'Lead time: {lead_time} month')

        for j in range(n_decades - 1):
            m = model(**kwargs)
            dir_name = f"{m.hyperparameters['name']}_decade{decades[j]}_lead{lead_time}"
            path = join(modeldir, dir_name)

            # number of files already stored for this model
            # (0 when the directory does not exist yet)
            n_files = len(listdir(path)) if exists(path) else 0

            if n_files > 0:
                print(f'{dir_name} already exists')
            else:
                first_date = f'{decades[j]}-01-01'
                last_date = f'{decades[j+1]-1}-12-01'
                small_print_header(f'Test period: {first_date} till {last_date}')

                # everything outside the test period is used for training
                in_test_period = (timey >= first_date) & (timey <= last_date)
                in_training = np.invert(in_test_period)

                m.fit_RandomizedSearch(X[in_training, :], y[in_training],
                                       timey[in_training], n_iter=n_iter)
                m.save(location=modeldir, dir_name=dir_name)

            del m
from data_pipeline import pipeline import os import time plt.close("all") K.clear_session() #%% ============================================================================= # Deep ensemble # ============================================================================= decades = [60, 70, 80, 90, 100, 110] for lead_time in [0, 3, 6, 9, 12, 15]: X, y, timey, yp = pipeline(lead_time, return_persistance=True) print_header(f'Lead time: {lead_time} month') for decade in decades: small_print_header( f'Test period: {1902+decade}-01-01 till {1911+decade}-12-01') # jump loop iteration if already trained ens_dir = f'ensemble_decade{decade}_lead{lead_time}' out_dir = os.path.join(modeldir, ens_dir) modified_time = time.gmtime(os.path.getmtime(out_dir)) compare_time = time.strptime("21-7-2019 13:00 UTC", "%d-%m-%Y %H:%M %Z") if modified_time > compare_time: print("Trained already!")
def cross_hindcast(model, pipeline, model_name):
    """
    Generate a hindcast using the models which were trained by the
    .cross_training() method and store it in a netCDF file.

    For every lead time, the model belonging to each test period is loaded
    from ``modeldir`` and used to predict the samples of exactly that period.
    The predictions of all periods are joined along the time axis and written
    to ``<model_name>_forecasts.nc`` in ``processeddir`` with one variable per
    model output and the dimensions ``('target_season', 'lead')``.

    :param model: The considered model class. It is instantiated without
        arguments and has to provide ``.load()``, ``.predict()``,
        ``.n_outputs`` and ``.output_names``.
    :param pipeline: The data pipeline that already was used before in
        .cross_training(). It is called as
        ``pipeline(lead_time, return_persistance=True)``.
    :param model_name: Prefix of the saved model directories
        (``<model_name>_decade<year>_lead<lead_time>``) and of the output file.
    :raises Exception: If there is no lead time to process, if no test sample
        was selected for a lead time, or if the first predicted date of a
        lead time is not 1963-01-01.
    """
    expected_first_date = '1963-01-01'

    # allocated once the length of the full time series is known
    pred_save = None

    for i in range(n_lead):
        lead_time = lead_times[i]
        print_header(f'Lead time: {lead_time} months')

        # labels and persistence are returned by the pipeline but are not
        # needed to produce the hindcast
        X, _, timey, _ = pipeline(lead_time, return_persistance=True)

        timeytrue = pd.DatetimeIndex([])
        pred_parts = []

        for j in range(n_decades - 1):
            small_print_header(
                f'Predict: {decades[j]}-01-01 till {decades[j+1]-1}-12-01')

            # test indices
            test_indeces = (timey >= f'{decades[j]}-01-01') & (
                timey <= f'{decades[j+1]-1}-12-01')
            testX, testtimey = X[test_indeces, :], timey[test_indeces]

            m = model()
            m.load(location=modeldir,
                   dir_name=f'{model_name}_decade{decades[j]}_lead{lead_time}')

            # variables for which the model must be loaded
            if not pred_parts:
                n_outputs = m.n_outputs
                output_names = m.output_names

            # make prediction; assigning into a pre-shaped array broadcasts
            # whatever .predict() returns to (n_outputs, n_samples)
            pred = np.zeros((m.n_outputs, testX.shape[0]))
            pred[:, :] = m.predict(testX)

            pred_parts.append(pred)
            timeytrue = timeytrue.append(testtimey)
            del m

        if len(timeytrue) == 0:
            raise ValueError(
                f"No test samples were selected for lead time {lead_time}")

        if timeytrue[0] != pd.to_datetime(expected_first_date):
            got_first_date = timeytrue[0].isoformat()[:10]
            raise Exception(
                f"The first predicted date for lead time {lead_time} "
                f"is {got_first_date} but expected {expected_first_date}")

        # make the full time series
        pred_full = np.concatenate(pred_parts, axis=1)

        # allocate the array for which the full length of the time series
        # must be known
        if pred_save is None:
            pred_save = np.zeros((n_outputs, len(timeytrue), n_lead))

        pred_save[:, :, i] = pred_full

    if pred_save is None:
        raise ValueError("No lead times to generate a hindcast for")

    # Save data to a netcdf file
    save_dict = {
        output_names[k]: (['target_season', 'lead'], pred_save[k, :, :])
        for k in range(n_outputs)
    }

    ds = xr.Dataset(save_dict, coords={'target_season': timeytrue,
                                       'lead': lead_times})
    ds.to_netcdf(join(processeddir, f'{model_name}_forecasts.nc'))
lr=[0.0001, 0.01], batch_size=100, epochs=500, n_segments=5, n_members_segment=1, patience=30, verbose=0, pdf='normal') model.fit_RandomizedSearch(trainX, trainy, n_iter=100) #%% pred_mean, pred_std = model.predict(testX) loss = model.evaluate(testy, pred_mean, pred_std) print_header(f"Loss: {loss}") if model.pdf == "normal": ens_dir = f'ensemble_lead{lead_time}' elif model.pdf == "skewed": ens_dir = f'ensemble_skew_lead{lead_time}' else: ens_dir = f'simple_ensemble_lead{lead_time}' model.save(location=modeldir, dir_name=ens_dir) #%% ============================================================================= # Plots # ============================================================================= plt.close("all")
The downloaded data needed to be prepared to have the similiar time-axis. All spatial data is regridded to the 2.5x2.5 grid of the NCEP reanalysis data. Some variables are computed, i.e the wind stress field, the wind speed and the warm pool edge. """ import numpy as np from ninolearn.utils import print_header from ninolearn.preprocess.prepare import prep_oni, prep_nino_month, prep_wwv from ninolearn.preprocess.prepare import prep_iod, prep_K_index, prep_wwv_proxy from ninolearn.preprocess.prepare import calc_warm_pool_edge, prep_other_forecasts print_header("Prepare Data") # ============================================================================= # Prepare the incedes # ============================================================================= prep_oni() prep_nino_month(index="3.4") prep_nino_month(index="3") prep_nino_month(index="1+2") prep_nino_month(index="4") prep_wwv() prep_wwv(cardinal_direction="west") prep_iod() prep_K_index() prep_wwv_proxy()
from ninolearn.utils import print_header
from ninolearn.preprocess.network import networkMetricsSeries

print_header("Network Metrics")

# Keyword arguments that are identical for both data sets below.
_shared_settings = dict(processed="anom", threshold=0.9, window_size=12,
                        lon_min=120, lon_max=280, lat_min=-30, lat_max=30,
                        verbose=1)

# GODAS variable 'sshg', 1980-2018
nms_ssh_godas = networkMetricsSeries('sshg', 'GODAS',
                                     startyear=1980, endyear=2018,
                                     **_shared_settings)
nms_ssh_godas.computeTimeSeries()

# ORAS4 variable 'zos', 1959-2017
nms_ssh_oras4 = networkMetricsSeries('zos', 'ORAS4',
                                     startyear=1959, endyear=2017,
                                     **_shared_settings)
def fit_RandomizedSearch(self, trainX, trainy, n_iter=10, **kwargs):
    """
    Hyperparameter optimization using random search.

    In each of the ``n_iter`` iterations, every hyperparameter listed in
    ``self.hyperparameters_search`` is drawn at random between its two
    bounds, the model is fitted and ``self.mean_val_loss`` is compared with
    the best loss so far. Afterwards the model is refitted with the best
    hyperparameters, which are also kept in ``self.best_hyperparameters``.

    Bounds are interpreted by their type:

    * two floats: uniform draw from ``[low, high)``
    * two ints: integer draw from ``low`` to ``high`` (both inclusive)
    * two tuples: element-wise integer draw (both bounds inclusive)

    Bounds of any other type combination leave the hyperparameter untouched.

    :type trainX: np.ndarray
    :param trainX: The training feature set. 2-D array with dimensions
        (timesteps, features).

    :type trainy: np.ndarray
    :param trainy: The training label set. 2-D array with dimensions
        (timesteps, labels).

    :type n_iter: int
    :param n_iter: The number of random hyperparameter sets that are tried.

    :param kwargs: Keyword arguments are passed to the .fit() method.

    :raises Exception: If no hyperparameter was marked for the search.
    :raises RuntimeError: If no iteration produced a validation loss below
        infinity, i.e. there are no best hyperparameters to refit with.
    """
    # check if hyperparameters were provided in lists for randomized search
    if len(self.hyperparameters_search) == 0:
        raise Exception("No variable indicated for hyperparameter search!")

    def is_int(value):
        # bool is a subclass of int but is not a sensible integer bound
        return isinstance(value, int) and not isinstance(value, bool)

    # iterate with randomized hyperparameters
    best_loss = np.inf
    found_best = False

    for i in range(n_iter):
        print_header(f"Search iteration Nr {i+1}/{n_iter}")

        # random selection of hyperparameters
        for key, bounds in self.hyperparameters_search.items():
            low, high = bounds[0], bounds[1]

            if isinstance(low, float) and isinstance(high, float):
                self.hyperparameters[key] = np.random.uniform(low, high)

            elif is_int(low) and is_int(high):
                # randint excludes the upper bound, hence the +1
                self.hyperparameters[key] = np.random.randint(low, high + 1)

            elif isinstance(low, tuple) and isinstance(high, tuple):
                self.hyperparameters[key] = tuple(
                    np.random.randint(lo, hi + 1)
                    for lo, hi in zip(low, high))

        self.fit(trainX, trainy, **kwargs)

        # check if validation score was enhanced
        if self.mean_val_loss < best_loss:
            best_loss = self.mean_val_loss
            self.best_hyperparameters = self.hyperparameters.copy()
            found_best = True

            small_print_header("New best hyperparameters")
            print(f"Mean loss: {best_loss}")
            print(self.best_hyperparameters)

    # Without this guard the refit below would either fail with an
    # AttributeError or silently reuse the result of an earlier search.
    if not found_best:
        raise RuntimeError(
            "Random search found no hyperparameters with a finite validation "
            "loss (n_iter < 1 or all losses were NaN/inf)")

    # refit the model with optimized hyperparameter
    # AND to have the weights of the DE for the best hyperparameters again
    print_header("Refit the model with best hyperparamters")

    self.hyperparameters = self.best_hyperparameters.copy()
    print(self.hyperparameters)
    self.fit(trainX, trainy, **kwargs)

    print(f"best loss search: {best_loss}")
    print(f"loss refitting : {self.mean_val_loss}")
lr=0.001, batch_size=1, epochs=500, n_segments=5, n_members_segment=1, patience=30, verbose=1, std=True) model.fit(trainX, trainy, testX, testy, use_pretrained=False) #%% pred_mean, pred_std = model.predict(testX) score = model.evaluate(testy, pred_mean, pred_std) print_header(f"Score: {score}") if model.std: ens_dir = f'ensemble_lead{lead_time}' else: ens_dir = f'simple_ensemble_lead{lead_time}' model.save(location=modeldir, dir_name=ens_dir) #%% ============================================================================= # Plots # ============================================================================= plt.close("all") # Scores during trianing plt.subplots()
# preprocess data feature_unscaled = feature.values.reshape(feature.shape[0], -1) label_unscaled = label.values.reshape(label.shape[0], -1) scaler_f = StandardScaler() Xorg = scaler_f.fit_transform(feature_unscaled) scaler_l = StandardScaler() yorg = scaler_l.fit_transform(label_unscaled) Xall = np.nan_to_num(Xorg) yall = np.nan_to_num(yorg) shift = 3 lead = 9 print_header(f'Lead time: {lead} months') y = yall[lead + shift:] X = Xall[:-lead - shift] timey = oni.index[lead + shift:] y_nino = oni[lead + shift:] pred_full_oni = np.array([]) true_oni = np.array([]) timeytrue = pd.DatetimeIndex([]) pred_da_full = xr.zeros_like(label[lead + shift:, :, :]) for j in range(n_decades):
""" The following script downloads all data that was relevant for my master thesis. """ from ninolearn.download import download, sources from ninolearn.utils import print_header print_header("Download Data") #%% # ============================================================================= # Single files # ============================================================================= download(sources.SST_ERSSTv5) download(sources.ONI) download(sources.NINOindeces) download(sources.IOD) download(sources.HCA) download(sources.OLR_NOAA) download(sources.WWV) download(sources.WWV_West) download(sources.UWIND_NCEP) download(sources.VWIND_NCEP) download(sources.VWIND_NCEP) download(sources.SAT_monthly_NCEP) download(sources.otherForecasts) # ============================================================================= # Multiple files # ============================================================================= for i in range(1958, 2018):
import numpy as np

from ninolearn.download import downloadFileFTP, downloadFileHTTP, download
from ninolearn.private import CMEMS_password, CMEMS_username
from ninolearn.utils import print_header
from ninolearn.sources import SST_ERSSTv5

# =============================================================================
# =============================================================================
# # Download
# =============================================================================
# =============================================================================
print_header("Download Data")

# download through the source object interface
download(SST_ERSSTv5)

#%% ===========================================================================
# ERSSTv5
# =============================================================================
# description of the remote file: file name, FTP host and remote directory
ERSSTv5_dict = dict(
    filename='sst.mnmean.nc',
    host='ftp.cdc.noaa.gov',
    location='/Datasets/noaa.ersst.v5/',
)

downloadFileFTP(ERSSTv5_dict)

# =============================================================================
# NINO3.4 Index
# =============================================================================
Xorg = np.load(join(infodir,'Xorg.npy')) # include values of 3 and 6 months previously n_lags = 3 step = 3 X = include_time_lag(Xorg, n_lags = n_lags, step=step) X = X[-1:,:] # now use only the latest observation to produce forecast # ============================================================================= # For each lead time, load ensemble of models and make prediction # ============================================================================= lead_times = np.load(join(infodir,'lead_times.npy')) predictions = np.zeros((2,len(lead_times))) # first row: mean, second row: std print_header("Making predictions") for i in np.arange(len(lead_times)): print("Lead time "+str(lead_times[i])+" months") dem = DEM(layers=1, neurons = 32, dropout=0.05, noise_in=0.0, noise_sigma=0., noise_mu=0., l1_hidden=0.0, l2_hidden=0., l1_mu=0, l2_mu=0., l1_sigma=0, l2_sigma=0.0, lr=0.01, batch_size=100, epochs=5000, n_segments=5, n_members_segment=3, patience=25, activation='tanh', verbose=0, pdf="normal", name="gdnn_ex_pca") for j in decades[:-1]: dem.load(location=modeldir, dir_name = 'gdnn_ex_pca_decade'+str(j)+'_lead'+str(lead_times[i])) pred = dem.predict(X) predictions[0,i] = pred[0][0] # mean predictions[1,i] = pred[1][0] # std
label_unscaled = label.values.reshape(label.shape[0],-1) scaler_f = StandardScaler() Xorg = scaler_f.fit_transform(feature_unscaled) scaler_l = StandardScaler() yorg = scaler_l.fit_transform(label_unscaled) Xall = np.nan_to_num(Xorg) yall = np.nan_to_num(yorg) # shift shift = 3 for lead in [3, 6, 9, 12, 15, 0]: print_header(f'Lead time: {lead} month') y = yall[lead+shift:] X = Xall[:-lead-shift] timey = oni.index[lead+shift:] for decade in [60, 70, 80, 90, 100, 110]: print_header(f'Test period: {1902+decade}-01-01 till {1911+decade}-12-01') K.clear_session() # jump loop iteration if already trained ens_dir=f'ed_ensemble_decade{decade}_lead{lead}' out_dir = os.path.join(ed_model_dir, ens_dir) modified_time = time.gmtime(os.path.getmtime(out_dir)) compare_time = time.strptime("15-7-2019 13:00 UTC", "%d-%m-%Y %H:%M %Z")
def cross_training(model, pipeline, n_iter, lead_times, **kwargs):
    """
    Train the model once per lead time and withheld test period.

    For each lead time the data are requested from the pipeline. For each
    pair of consecutive entries of the module-level ``decades``, the samples
    whose time stamp falls into that period are spared, a new model instance
    is tuned on the remaining samples by ``.fit_RandomizedSearch()`` and then
    saved in ``modeldir``. If the save directory of a period already exists
    and holds at least one file, the period is not trained again.

    :param model: A model class that follows the guidelines how a model
        object should be set up. It is instantiated as ``model(**kwargs)``
        and has to provide ``hyperparameters['name']``,
        ``.fit_RandomizedSearch()`` and ``.save()``.
    :param pipeline: A function that is called as
        ``pipeline(lead_time, return_persistance=False)`` and returns the
        corresponding feature, label and time.
    :param n_iter: Forwarded as ``n_iter`` to ``.fit_RandomizedSearch()``.
    :param lead_times: Iterable of the lead times to train for.
    :param **kwargs: Keyword arguments that are passed to the constructor of
        the provided model.
    """
    for lead_time in lead_times:
        X, y, timey = pipeline(lead_time, return_persistance=False)
        print_header(f'Lead time: {lead_time} months')

        for j in range(n_decades - 1):
            m = model(**kwargs)
            dir_name = f"{m.hyperparameters['name']}_decade{decades[j]}_lead{lead_time}"
            path = join(modeldir, dir_name)

            # a model counts as trained when its directory holds any file
            already_trained = exists(path) and len(listdir(path)) > 0

            if already_trained:
                print(f'{dir_name} already exists')
            else:
                period_start = f'{decades[j]}-01-01'
                period_end = f'{decades[j+1]-1}-12-01'
                small_print_header(
                    f'Test period: {period_start} till {period_end}')

                # train on everything that lies outside the test period
                held_out = (timey >= period_start) & (timey <= period_end)
                keep = np.invert(held_out)
                trainX, trainy, traintime = X[keep, :], y[keep], timey[keep]

                m.fit_RandomizedSearch(trainX, trainy, traintime,
                                       n_iter=n_iter)
                m.save(location=modeldir, dir_name=dir_name)

            del m


# def cross_hindcast(model, pipeline, model_name, **kwargs):
#     """
#     Generate a hindcast from 1962 till today using the models which were
#     trained by the .cross_training() method.
#     :param model: The considered model.
#     :param pipeline: The data pipeline that already was used before in \
#     .cross_training().
# """ # first_lead_loop = True # for i in range(n_lead): # lead_time = lead_times[i] # print_header(f'Lead time: {lead_time} months') # X, y, timey, y_persistance = pipeline(lead_time, return_persistance=True) # ytrue = np.array([]) # timeytrue = pd.DatetimeIndex([]) # first_dec_loop = True # for j in range(n_decades-1): # small_print_header(f'Predict: {decades[j]}-01-01 till {decades[j+1]-1}-12-01') # # test indices # test_indeces = (timey>=f'{decades[j]}-01-01') & (timey<=f'{decades[j+1]-1}-12-01') # testX, testy, testtimey = X[test_indeces,:], y[test_indeces], timey[test_indeces] # m = model(**kwargs) # m.load(location=modeldir, dir_name=f'{model_name}_decade{decades[j]}_lead{lead_time}') # # allocate arrays and variables for which the model must be loaded # if first_dec_loop: # n_outputs = m.n_outputs # output_names = m.output_names # pred_full = np.zeros((n_outputs, 0)) # first_dec_loop=False # # make prediction # pred = np.zeros((m.n_outputs, testX.shape[0])) # pred[:,:] = m.predict(testX) # # make the full time series # pred_full = np.append(pred_full, pred, axis=1) # ytrue = np.append(ytrue, testy) # timeytrue = timeytrue.append(testtimey) # del m # if timeytrue[0]!=pd.to_datetime('1963-01-01'): # expected_first_date = '1963-01-01' # got_first_date = timeytrue[0].isoformat()[:10] # raise Exception(f"The first predicted date for lead time {lead_time} \ # is {got_first_date} but expected {expected_first_date}") # # allocate arrays and variables for which the full length of the time # # series must be known # if first_lead_loop: # n_time = len(timeytrue) # pred_save = np.zeros((n_outputs, n_time, n_lead)) # first_lead_loop=False # pred_save[:,:,i] = pred_full # # Save data to a netcdf file # save_dict = {} # for i in range(n_outputs): # save_dict[output_names[i]] = (['target_season', 'lead'], pred_save[i,:,:]) # ds = xr.Dataset(save_dict, coords={'target_season': timeytrue, # 'lead': lead_times} ) # ds.to_netcdf(join(processeddir, 
f'{model_name}_forecasts.nc')) # def cross_hindcast_dem(model, pipeline, model_name): # """ # Generate a hindcast from 1962 till today using the models which were # trained by the .cross_training() method. ONLY works for the DEM. # This routine returns an std estimate that is only based on the corrlation # skill of the DEM predicted mean. # :param model: The considered model. # :param pipeline: The data pipeline that already was used before in \ # .cross_training(). # """ # #cross_hindcast(model, pipeline, model_name) # std_estimate = xr.open_dataarray(join(processeddir, f'{model_name}_std_estimate.nc')) # first_lead_loop = True # for i in range(n_lead): # lead_time = lead_times[i] # print_header(f'Lead time: {lead_time} months') # X, y, timey, y_persistance = pipeline(lead_time, return_persistance=True) # ytrue = np.array([]) # timeytrue = pd.DatetimeIndex([]) # first_dec_loop = True # for j in range(n_decades-1): # small_print_header(f'Predict: {decades[j]}-01-01 till {decades[j+1]-1}-12-01') # # test indices # test_indeces = (timey>=f'{decades[j]}-01-01') & (timey<=f'{decades[j+1]-1}-12-01') # testX, testy, testtimey = X[test_indeces,:], y[test_indeces], timey[test_indeces] # m = model() # m.load(location=modeldir, dir_name=f'{model_name}_decade{decades[j]}_lead{lead_time}') # # allocate arrays and variables for which the model must be loaded # if first_dec_loop: # n_outputs = m.n_outputs # output_names = m.output_names # pred_full = np.zeros((n_outputs+1, 0)) # first_dec_loop=False # # make prediction # pred = np.zeros((m.n_outputs+1, testX.shape[0])) # pred[:2,:] = m.predict(testX) # for k in range(len(testtimey)): # month = testtimey[k].date().month # pred[-1, k] = std_estimate[i, month-1] # # make the full time series # pred_full = np.append(pred_full, pred, axis=1) # ytrue = np.append(ytrue, testy) # timeytrue = timeytrue.append(testtimey) # del m # if timeytrue[0]!=pd.to_datetime('1963-01-01'): # expected_first_date = '1963-01-01' # got_first_date = 
timeytrue[0].isoformat()[:10] # raise Exception(f"The first predicted date for lead time {lead_time} \ # is {got_first_date} but expected {expected_first_date}") # # allocate arrays and variables for which the full length of the time # # series must be known # if first_lead_loop: # n_time = len(timeytrue) # pred_save = np.zeros((n_outputs+1, n_time, n_lead)) # first_lead_loop=False # pred_save[:,:,i] = pred_full # # Save data to a netcdf file # save_dict = {} # for i in range(n_outputs + 1): # if i<n_outputs: # save_dict[output_names[i]] = (['target_season', 'lead'], pred_save[i,:,:]) # else: # save_dict['std_estimate'] = (['target_season', 'lead'], pred_save[i,:,:]) # ds = xr.Dataset(save_dict, coords={'target_season': timeytrue, # 'lead': lead_times} ) # ds.to_netcdf(join(processeddir, f'{model_name}_forecasts_with_std_estimated.nc')) # ds.close()