def generate_dataset(self, ahead=1, mode=None, ensemble=False, ens_slice=None, remote=None):
    """
    Generates the dataset for training, test and validation

    Dataset types (self.config['dataset']):
      0 = One site - wind
      1 = One site - all variables
      2 = All sites - wind
      3 = All sites - all variables
      4 = All sites - all variables stacked
      5 = Uses neighbor sites around a radius

    :param ahead: number of steps ahead for prediction (int, or [first, last] range)
    :param mode: type of dataset (pair indicating the type of dimension for input and output)
    :param ensemble: if True, subsample the site data using ens_slice
    :param ens_slice: (start, step) pair used to subsample when ensemble is True
    :param remote: if set, fetch the site data files through sftp before loading
    :return: None; results are stored in self.train_x/y, self.val_x/y, self.test_x/y
    """
    self.generated = True
    self.mode = mode

    # Site names and split sizes come from the configuration dictionary
    datanames = self.config['datanames']
    datasize = self.config['datasize']
    testsize = self.config['testsize']
    lag = self.config['lag']
    sel_vars = self.config['vars']
    wind = {}

    # Angle data lives in a parallel directory (data_path + '_angle')
    angle = self.config['angle'] if 'angle' in self.config else False

    # 'ahead' may be a [first, last] range: nslice is the number of
    # prediction steps kept, dahead the furthest horizon
    if isinstance(ahead, list):
        dahead = ahead[1]
        nslice = (ahead[1] - ahead[0]) + 1
    else:
        dahead = ahead
        nslice = ahead

    # Augment the dataset with the closest neighbors of the first site
    if self.config['dataset'] == 5 or self.config['dataset'] == 31:
        if 'radius' not in self.config:
            raise NameError("Radius missing for neighbours augmented dataset")
        radius = self.config['radius']
        if 'nneighbors' in self.config:
            datanames = get_closest_k_neighbors(datanames[0], radius, self.config['nneighbors'])
        else:
            datanames = get_all_neighbors(datanames[0], radius)

    # Reads numpy arrays for all sites and keeps only selected columns
    for d in datanames:
        if remote:
            # Fetch the file through sftp; the local copy is removed after loading
            srv = pysftp.Connection(host=remote_data[0], username=remote_data[1])
            srv.get(remote_wind_data_path + f"/{d}.npy", self.data_path + f"/{d}.npy")
            srv.close()

        if angle:
            wind[d] = np.load(self.data_path + '_angle' + f"/{d}.npy")
        else:
            wind[d] = np.load(self.data_path + f"/{d}.npy")

        if remote:
            os.remove(self.data_path + f"/{d}.npy")

        # If there is a list in the vars attribute it should be a list of
        # integer column indices to keep
        if isinstance(sel_vars, list):
            for v in sel_vars:
                if not isinstance(v, int) or v > wind[d].shape[1]:
                    raise NameError('Error in variable selection')
            wind[d] = wind[d][:, sel_vars]

    if (self.config['dataset'] == 0) or (self.config['dataset'] == 'onesiteonevar'):
        # First site, first variable only
        if not ensemble:
            self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                self._generate_dataset_one_var(wind[datanames[0]][:, 0].reshape(-1, 1),
                                               datasize, testsize,
                                               lag=lag, ahead=dahead, slice=nslice, mode=mode)
        else:
            self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                self._generate_dataset_one_var(wind[datanames[0]][ens_slice[0]::ens_slice[1], 0].reshape(-1, 1),
                                               datasize, testsize,
                                               lag=lag, ahead=dahead, slice=nslice, mode=mode)
    elif (self.config['dataset'] == 1) or (self.config['dataset'] == 'onesitemanyvar'):
        # First site, all (selected) variables
        if not ensemble:
            self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                self._generate_dataset_multiple_var(wind[datanames[0]], datasize, testsize,
                                                    lag=lag, ahead=dahead, slice=nslice, mode=mode)
        else:
            # BUG FIX: the slice must be applied to the data array, not to the
            # site-name string (wind[datanames[0][...]] raised a TypeError)
            self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                self._generate_dataset_multiple_var(wind[datanames[0]][ens_slice[0]::ens_slice[1], :],
                                                    datasize, testsize,
                                                    lag=lag, ahead=dahead, slice=nslice, mode=mode)
    elif self.config['dataset'] == 2 or self.config['dataset'] == 'manysiteonevar':
        # First variable of every site, sites side by side as columns
        stacked = np.vstack([wind[d][:, 0] for d in datanames]).T
        self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
            self._generate_dataset_multiple_var(stacked, datasize, testsize,
                                                lag=lag, ahead=dahead, slice=nslice, mode=mode)
    elif self.config['dataset'] == 3 or self.config['dataset'] == 31 \
            or self.config['dataset'] == 'manysitemanyvar':
        # All variables of all sites side by side as columns
        stacked = np.hstack([wind[d] for d in datanames])
        self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
            self._generate_dataset_multiple_var(stacked, datasize, testsize,
                                                lag=lag, ahead=dahead, slice=nslice, mode=mode)
    elif self.config['dataset'] == 4 or self.config['dataset'] == 5 or \
            self.config['dataset'] == 'manysitemanyvarstack':
        # One dataset per site; training examples stacked vertically, while
        # validation/test come from the first (experiment) site only
        stacked = [self._generate_dataset_multiple_var(wind[d], datasize, testsize,
                                                       lag=lag, ahead=dahead, slice=nslice, mode=mode)
                   for d in datanames]
        self.train_x = np.vstack([x[0] for x in stacked])
        self.train_y = np.vstack([x[1] for x in stacked])
        self.val_x = stacked[0][2]
        self.val_y = stacked[0][3]
        self.test_x = stacked[0][4]
        self.test_y = stacked[0][5]
    else:
        raise NameError('ERROR: No such dataset type')
def generate_dataset(self, ahead=1, mode=None, ensemble=False, ens_slice=None, remote=None):
    """
    Generates the dataset for training, test and validation

    Dataset types (self.config['dataset']):
      0 = One site - wind
      1 = One site - all variables
      2 = All sites - wind
      3 = All sites - all variables
      4 = All sites - all variables stacked
      5 = Uses neighbor sites around a radius
      6 = Uses random sites outside a radius

    :param ahead: number of steps ahead for prediction (int, or [first, last] range)
    :param mode: type of dataset (pair indicating the type of dimension for input and output)
    :param ensemble: if True, subsample the site data using ens_slice
    :param ens_slice: (start, step) pair used to subsample when ensemble is True
    :param remote: if set, fetch the site data files through sftp before loading
    :return: None; results are stored in self.train_x/y, self.val_x/y, self.test_x/y
    """
    self.generated = True
    self.mode = mode

    # Site names and split sizes come from the configuration dictionary
    datanames = self.config['datanames']
    datasize = self.config['datasize']
    testsize = self.config['testsize']
    lag = self.config['lag']
    sel_vars = self.config['vars']
    # Sampling period of the data in minutes (used for the sinusoidal
    # day/year features); None disables the feature
    period = self.config['period'] if 'period' in self.config else None
    wind = {}

    # Angle data lives in a parallel directory (data_path + '_angle')
    angle = self.config['angle'] if 'angle' in self.config else False

    # 'ahead' may be a [first, last] range: nslice is the number of
    # prediction steps kept, dahead the furthest horizon
    if isinstance(ahead, list):
        dahead = ahead[1]
        nslice = (ahead[1] - ahead[0]) + 1
    else:
        dahead = ahead
        nslice = ahead

    # Augment the dataset with the closest neighbors of the first site
    if self.config['dataset'] == 5 or self.config['dataset'] == 31:
        if 'radius' not in self.config:
            raise NameError("Radius missing for neighbours augmented dataset")
        radius = self.config['radius']
        if 'nneighbors' in self.config:
            datanames = get_closest_k_neighbors(datanames[0], radius, self.config['nneighbors'])
        else:
            datanames = get_all_neighbors(datanames[0], radius)

    # Augment the dataset with random non-neighbors (outside a radius)
    if self.config['dataset'] == 6:
        if 'radius' not in self.config:
            raise NameError("Radius missing for neighbours augmented dataset")
        radius = self.config['radius']
        nonneigh = 100 if 'nonneighbors' not in self.config else self.config['nonneighbors']
        nndnames = get_random_k_nonneighbors(datanames[0], radius, nonneigh)
        datanames.extend(nndnames)

    # Reads numpy arrays for all sites and keeps only selected columns
    for d in datanames:
        if remote:
            # Fetch the file through sftp; the local copy is removed after loading
            srv = pysftp.Connection(host=remote_data[0], username=remote_data[1])
            srv.get(remote_wind_data_path + f"/{d}.npy", self.data_path + f"/{d}.npy")
            srv.close()

        if angle:
            wind[d] = np.load(self.data_path + '_angle' + f"/{d}.npy")
        else:
            wind[d] = np.load(self.data_path + f"/{d}.npy")

        if remote:
            os.remove(self.data_path + f"/{d}.npy")

        # If there is a list in the vars attribute it should be a list of
        # integer column indices to keep
        if isinstance(sel_vars, list):
            for v in sel_vars:
                if not isinstance(v, int) or v > wind[d].shape[1]:
                    raise NameError('Error in variable selection')
            wind[d] = wind[d][:, sel_vars]

        # If the period flag is on we add sinusoidal variables to the data
        # with period a day and a year (period = minutes per sample)
        if period is not None:
            day = np.zeros((wind[d].shape[0], 1))
            freq = int(24 * 60 / period)
            for i in range(freq):
                day[i::freq] = np.sin((2 * np.pi / freq) * i)

            year = np.zeros((wind[d].shape[0], 1))
            freq = int(365 * 24 * 60 / period)
            for i in range(freq):
                year[i::freq] = np.sin((2 * np.pi / freq) * i)

            wind[d] = np.concatenate((wind[d], day, year), axis=1)

    if (self.config['dataset'] == 0) or (self.config['dataset'] == 'onesiteonevar'):
        # First site, first variable only
        if not ensemble:
            self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                self._generate_dataset_one_var(wind[datanames[0]][:, 0].reshape(-1, 1),
                                               datasize, testsize,
                                               lag=lag, ahead=dahead, slice=nslice, mode=mode)
        else:
            self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                self._generate_dataset_one_var(wind[datanames[0]][ens_slice[0]::ens_slice[1], 0].reshape(-1, 1),
                                               datasize, testsize,
                                               lag=lag, ahead=dahead, slice=nslice, mode=mode)
    elif (self.config['dataset'] == 1) or (self.config['dataset'] == 'onesitemanyvar'):
        # First site, all (selected) variables
        if not ensemble:
            self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                self._generate_dataset_multiple_var(wind[datanames[0]], datasize, testsize,
                                                    lag=lag, ahead=dahead, slice=nslice, mode=mode)
        else:
            # BUG FIX: the slice must be applied to the data array, not to the
            # site-name string (wind[datanames[0][...]] raised a TypeError)
            self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
                self._generate_dataset_multiple_var(wind[datanames[0]][ens_slice[0]::ens_slice[1], :],
                                                    datasize, testsize,
                                                    lag=lag, ahead=dahead, slice=nslice, mode=mode)
    elif self.config['dataset'] == 2 or self.config['dataset'] == 'manysiteonevar':
        # First variable of every site, sites side by side as columns
        stacked = np.vstack([wind[d][:, 0] for d in datanames]).T
        self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
            self._generate_dataset_multiple_var(stacked, datasize, testsize,
                                                lag=lag, ahead=dahead, slice=nslice, mode=mode)
    elif self.config['dataset'] == 3 or self.config['dataset'] == 31 \
            or self.config['dataset'] == 'manysitemanyvar':
        # All variables of all sites side by side as columns
        stacked = np.hstack([wind[d] for d in datanames])
        self.train_x, self.train_y, self.val_x, self.val_y, self.test_x, self.test_y = \
            self._generate_dataset_multiple_var(stacked, datasize, testsize,
                                                lag=lag, ahead=dahead, slice=nslice, mode=mode)
    elif self.config['dataset'] == 4 or self.config['dataset'] == 5 or \
            self.config['dataset'] == 'manysitemanyvarstack':
        # One dataset per site; training examples stacked vertically, while
        # validation/test come from the first (experiment) site only
        stacked = [self._generate_dataset_multiple_var(wind[d], datasize, testsize,
                                                       lag=lag, ahead=dahead, slice=nslice, mode=mode)
                   for d in datanames]
        self.train_x = np.vstack([x[0] for x in stacked])
        self.train_y = np.vstack([x[1] for x in stacked])
        self.val_x = stacked[0][2]
        self.val_y = stacked[0][3]
        self.test_x = stacked[0][4]
        self.test_y = stacked[0][5]
    elif self.config['dataset'] == 6:
        # Training augmenting the dataset with random sites outside a radius
        stacked = [self._generate_dataset_multiple_var(wind[d], datasize, testsize,
                                                       lag=lag, ahead=dahead, slice=nslice, mode=mode)
                   for d in datanames]
        # Training with all the sites
        self.train_x = np.vstack([x[0] for x in stacked])
        self.train_y = np.vstack([x[1] for x in stacked])
        # Testing and validating only with the experiment site
        self.val_x = stacked[0][2]
        self.val_y = stacked[0][3]
        self.test_x = stacked[0][4]
        self.test_y = stacked[0][5]
    else:
        raise NameError('ERROR: No such dataset type')