def get_train_test(test_start, test_stop, model='ar'):
    """Merge the train and test files around a held-out test period.

    Parameters
    ----------
    test_start : str
        Start of the test period, forwarded to the file-listing helpers.
    test_stop : str
        Stop of the test period, forwarded to the file-listing helpers.
    model : str, default 'ar'
        'ar' selects the AR file-listing helpers; any other value selects
        the ``*_traditional_model`` helpers.

    Returns
    -------
    train_dataset, test_dataset
        The results of ``merge`` on the train files (everything excluding
        the test period) and on the test files.
    """
    print('gets train test data for ar model')

    # The selector is the ``model`` argument. The previous code compared the
    # builtin ``type`` to 'ar', which is never equal, so the AR helpers were
    # unreachable.
    if model == 'ar':
        files_train = get_list_of_files_excluding_period(test_start, test_stop)
        files_test = get_list_of_files(test_start, test_stop)
    else:
        files_train = get_list_of_files_excluding_period_traditional_model(
            test_start, test_stop)
        files_test = get_list_of_files_traditional_model(test_start, test_stop)

    print('Detected {} train files and {} test files. Merging might take a while .... '.format(len(files_train), len(files_test)))
    train_dataset = merge(files_train)
    print('finished merging train')
    test_dataset = merge(files_test)
    return train_dataset, test_dataset
def load(self, lat, lon):
    """Load the dataset for ``self.start``..``self.stop`` and visit every pixel.

    For each (latitude, longitude) pair on the grid the pixel is extracted,
    converted to arrays with ``dataset_to_numpy`` and the rows containing
    NaN are dropped.

    Parameters
    ----------
    lat, lon
        Not used: the grid loops below iterate over ``self.latitude`` and
        ``self.longitude`` instead.

    Returns
    -------
    X, y
        The NaN-free arrays of the LAST pixel visited; ``y`` is the last
        column of the concatenated array, kept two-dimensional.

    Notes
    -----
    ``self.dataset`` is replaced by the merged files. ``self.mean`` and
    ``self.std`` are set to all-zero arrays of shape
    (n_latitude, n_longitude, n_variables); no statistics are computed here.
    """
    file_list = get_list_of_files(start=self.start, stop=self.stop)
    print("Loading {} files.".format(len(file_list)))
    self.dataset = merge(file_list)

    # Placeholders only; nothing below fills them in.
    grid_shape = (len(self.latitude), len(self.longitude), len(self.variables))
    self.mean = np.zeros(grid_shape)
    self.std = np.zeros(grid_shape)

    for pixel_lat in self.latitude:
        for pixel_lon in self.longitude:
            pixel = get_pixel_from_ds(self.dataset, pixel_lat, pixel_lon)
            X, y = dataset_to_numpy(pixel, bias=self.bias)
            print('Number of samples prior to removal of nans {}.'.format(
                len(y)))

            # Drop every row that has a NaN in either the predictors or
            # the response.
            stacked = np.concatenate([X, y], axis=1)
            row_has_nan = np.isnan(stacked).any(axis=1)
            stacked = stacked[~row_has_nan]

            X = stacked[:, :-1]
            y = stacked[:, -1, np.newaxis]

    return X, y
def load(self):
    """Load the dataset and min-max scale the predictors of every pixel.

    NaNs are kept in the data: ``MinMaxScaler`` disregards them in fit and
    keeps them in transform.

    Returns
    -------
    X, y
        The scaled predictors and the untouched response of the LAST pixel
        visited. The per-pixel results are not accumulated, because the
        pixels may have different numbers of samples.

    Notes
    -----
    ``self.dataset`` is replaced by the merged files. ``self.min`` and
    ``self.max`` receive the per-pixel ``data_min_`` / ``data_max_`` of the
    fitted scalers, with shape (n_latitude, n_longitude, n_variables).
    NOTE(review): this assumes ``dataset_to_numpy`` returns exactly
    ``len(self.variables)`` predictor columns -- confirm.
    """
    from sklearn.preprocessing import MinMaxScaler

    files = get_list_of_files(start=self.start, stop=self.stop)
    print("Loading {} files.".format(len(files)))
    self.dataset = merge(files)

    grid_shape = (len(self.latitude), len(self.longitude), len(self.variables))
    mins = np.zeros(grid_shape)
    maxs = np.zeros(grid_shape)

    for i, lat in enumerate(self.latitude):
        for j, lon in enumerate(self.longitude):
            ds = get_pixel_from_ds(self.dataset, lat, lon)
            X, y = dataset_to_numpy(ds, bias=self.bias)
            print('Number of samples prior to removal of nans {}.'.format(
                len(y)))

            # Scale this pixel's own predictors. The previous code passed an
            # undefined name (``pX``) to the scaler and tried to write the
            # result into ``X[:, i, j, :]``, so it could not run.
            minmax = MinMaxScaler()
            X = minmax.fit_transform(X)
            mins[i, j, :] = minmax.data_min_.flatten()
            maxs[i, j, :] = minmax.data_max_.flatten()

    self.min = mins
    self.max = maxs
    return X, y
def get_train_test(test_start, test_stop, model='ar'):
    """Load train and test data into merged datasets.

    Parameters
    ----------
    test_start : str
        Start of the test period, forwarded to the file-listing helpers.
    test_stop : str
        Stop of the test period, forwarded to the file-listing helpers.
    model : str, default 'ar'
        'ar' selects the AR file-listing helpers; any other value selects
        the ``*_traditional_model`` helpers.

    Returns
    -------
    train_dataset, test_dataset
        The results of ``merge`` on the train files (everything excluding
        the test period) and on the test files.
    """
    # Branch on the ``model`` argument. The previous code compared the
    # builtin ``type`` to 'ar', which is never equal, so the traditional
    # helpers were always used.
    if model == 'ar':
        files_train = get_list_of_files_excluding_period(test_start, test_stop)
        files_test = get_list_of_files(test_start, test_stop)
    else:
        files_train = get_list_of_files_excluding_period_traditional_model(
            test_start, test_stop)
        files_test = get_list_of_files_traditional_model(test_start, test_stop)

    train_dataset = merge(files_train)
    test_dataset = merge(files_test)
    return train_dataset, test_dataset
def fit(self):
    """Fit one model per pixel and store the results on the instance.

    When both ``self.test_start`` and ``self.test_stop`` are set, the test
    files for that period are merged into ``self.test_dataset`` first. Every
    (latitude, longitude) pair is then handed to ``self.load_transform_fit``
    and its outputs are written into per-pixel arrays.

    Sets ``self.coeff_matrix`` (n_lat, n_lon, bias + n_variables + order) and
    the (n_lat, n_lon) arrays ``self.mse``, ``self.ase``, ``self.r2``,
    ``self.num_test_samples`` and ``self.num_train_samples``.

    Raises
    ------
    NotImplementedError
        If ``self.transform`` is falsy when a pixel is about to be fitted.
    """
    if self.test_start is not None and self.test_stop is not None:
        files = get_list_of_files_traditional_model(start=self.test_start,
                                                    stop=self.test_stop,
                                                    include_start=True,
                                                    include_stop=True)
        self.test_dataset = merge(files)

    n_lat = len(self.latitude)
    n_lon = len(self.longitude)
    n_pixels = n_lat * n_lon
    num_vars = self.bias + len(self.variables) + self.order

    coeff_matrix = np.zeros((n_lat, n_lon, num_vars))
    mse_storage = np.zeros((n_lat, n_lon))
    r2_storage = np.zeros((n_lat, n_lon))
    ase_storage = np.zeros((n_lat, n_lon))
    num_train_samples = np.zeros((n_lat, n_lon))
    num_test_samples = np.zeros((n_lat, n_lon))

    for i, lat in enumerate(self.latitude):
        for j, lon in enumerate(self.longitude):
            if not self.transform:
                raise NotImplementedError('Implement this shit .... ')

            coeff, mse, ase, r2, num_test, num_train = self.load_transform_fit(
                lat, lon)

            coeff_matrix[i, j, :] = coeff
            mse_storage[i, j] = mse
            r2_storage[i, j] = r2
            ase_storage[i, j] = ase
            # NOTE(review): the fifth return value (unpacked as ``num_test``)
            # is stored as the TRAIN count and the sixth as the TEST count.
            # This looks swapped, but the return order of
            # ``load_transform_fit`` is not visible here, so the original
            # assignment is kept -- confirm against the callee.
            num_train_samples[i, j] = num_test
            num_test_samples[i, j] = num_train

            # Running 1-based pixel index over the actual grid size. The
            # previous ``(i + 1) * j`` over a hard-coded 81 * 161 was not a
            # real counter.
            print('Finished with pixel {}/{}'.format(i * n_lon + j + 1,
                                                     n_pixels))

    self.coeff_matrix = coeff_matrix
    self.mse = mse_storage
    self.ase = ase_storage
    self.r2 = r2_storage
    self.num_test_samples = num_test_samples
    self.num_train_samples = num_train_samples
    return
def load_pixel(self, lat, lon):
    """Load one pixel's data as NaN-free arrays.

    The files for ``self.start``..``self.stop`` are merged into
    ``self.dataset`` on every call; the pixel at (``lat``, ``lon``) is then
    extracted and converted with ``dataset_to_numpy``.

    Parameters
    ----------
    lat, lon
        Coordinates of the pixel, passed on to ``get_pixel_from_ds``.

    Returns
    -------
    X, y
        Predictors and response with every row containing a NaN removed.
        ``y`` is the last column of the concatenated array, kept
        two-dimensional. Marked "not tested" by the original author.
    """
    file_list = get_list_of_files(start=self.start, stop=self.stop)
    print("Loading {} files.".format(len(file_list)))
    self.dataset = merge(file_list)

    pixel = get_pixel_from_ds(self.dataset, lat, lon)
    X, y = dataset_to_numpy(pixel, bias=self.bias)
    print('Number of samples prior to removal of nans {}.'.format(len(y)))

    # Filter predictors and response together so the rows stay aligned.
    stacked = np.concatenate([X, y], axis=1)
    row_has_nan = np.isnan(stacked).any(axis=1)
    clean = stacked[~row_has_nan]

    return clean[:, :-1], clean[:, -1, np.newaxis]
def get_xarray_dataset_for_period(start='2012-01-01', stop='2012-01-31'):
    """Read the data of the requested period into one merged dataset.

    The files listed by ``get_list_of_files`` for the period are merged.
    Unless ``stop`` is None, the result is cut along ``time`` to the label
    slice ``start``..``stop``. With the defaults this is one month of data.

    Parameters
    ----------
    start : str
        Start of period. First day included. (default '2012-01-01')
    stop : str, optional
        End of period. Last day included. (default '2012-01-31')
        Passing None skips the time slicing.

    Returns
    -------
    data : xr.Dataset
        Dataset including all variables in the requested period.
    """
    merged = merge(get_list_of_files(start=start, stop=stop))
    if stop is None:
        return merged
    return merged.sel(time=slice(start, stop))
def __init__(self, start=None, stop=None, test_start=None, test_stop=None,
             order=1, transform=False, sigmoid=False):
    """Set up the model and load its training dataset.

    Parameters
    ----------
    start : str
        Start date of training/fitting the model. Format: year-month-date
    stop : str
        Stop date of training/fitting the model. Format: year-month-date
    test_start : str, optional
        Start date of test data, used in evaluation of the model.
    test_stop : str, optional
        Stop date of test data, used in evaluation of the model.
    order : int
        Number of previous time steps included as predictors.
    transform : bool, default False
        Whether to standardize the data. When true the bias term is
        switched off and zero-filled ``mean``/``std`` containers are made.
    sigmoid : bool, default False
        Whether the response should be sigmoid transformed.

    Raises
    ------
    ValueError
        If none of ``start``, ``stop``, ``test_start`` and ``test_stop``
        is given.
    """
    if stop is not None:
        assert start < stop, "Start {} need to be prior to stop {}".format(
            start, stop)

    self.start = start
    self.stop = stop
    self.test_start = test_start
    self.test_stop = test_stop

    no_train_period = start is None and stop is None
    full_test_period = test_start is not None and test_stop is not None
    no_test_period = test_start is None and test_stop is None

    if no_train_period and full_test_period:
        # Train on everything outside the held-out test period.
        train_files = get_list_of_files_excluding_period(test_start, test_stop)
        self.dataset = merge(train_files)
    elif no_train_period and no_test_period:
        raise ValueError('Something is wrong with')
    else:
        # The start/stop period decides which files are read.
        self.dataset = get_xarray_dataset_for_period(start=self.start,
                                                     stop=self.stop)
    print('Finished loaded the dataset')

    self.order = order
    self.longitude = self.dataset.longitude.values
    self.latitude = self.dataset.latitude.values
    self.variables = []

    # Results, filled in later.
    self.test_dataset = None
    self.coeff_matrix = None
    self.evaluate_ds = None
    self.mse = None
    self.r2 = None
    self.ase = None
    self.num_test_samples = None
    self.num_train_samples = None

    self.transform = transform
    self.sigmoid = sigmoid

    if self.transform:
        # Containers for the per-pixel transformation statistics.
        stats_shape = (len(self.latitude), len(self.longitude),
                       len(self.variables) + self.order)
        self.mean = np.zeros(stats_shape)
        self.std = np.zeros(stats_shape)
        self.bias = False
    else:
        self.bias = True

    self.X_train = None
    self.y_train = None
    return
def __init__(self, start=None, stop=None, test_start=None, test_stop=None,
             train_dataset=None, test_dataset=None, order=1, transform=False,
             sigmoid=False, latitude=None, longitude=None, type='ar',
             bias=True):
    """Set up the model and load (or accept) its train and test datasets.

    Parameters
    ----------
    start : str
        Start date of training/fitting the model. Format: year-month-date
    stop : str
        Stop date of training/fitting the model. Format: year-month-date
    test_start : str, optional
        Start date of test data, used in evaluation of the model.
    test_stop : str, optional
        Stop date of test data, used in evaluation of the model.
    train_dataset : optional
        Already-merged training data; when given no training files are read.
    test_dataset : optional
        Already-merged test data; when given no test files are read.
    order : int
        Number of previous time steps included as predictors.
    transform : bool, default False
        Whether to standardize the data. Forces ``bias`` to False.
    sigmoid : bool, default False
        Whether the response should be sigmoid transformed.
    latitude, longitude : sequence, optional
        Grid coordinates to use; default to the coordinates of the
        training dataset.
    type : str, default 'ar'
        'ar' uses the AR file helpers and the variables
        ['t2m', 'q', 'r', 'sp']; any other value uses the
        ``*_traditional_model`` helpers and no variables. (The name shadows
        the builtin inside this method but is part of the interface.)
    bias : bool, default True
        Whether to include a bias term; ignored when ``transform`` is true.

    Raises
    ------
    ValueError
        If no training dataset is passed and none of ``start``, ``stop``,
        ``test_start`` and ``test_stop`` is given.
    """
    if stop is not None:
        assert start < stop, "Start {} need to be prior to stop {}".format(start, stop)

    self.start = start
    self.stop = stop
    self.test_start = test_start
    self.test_stop = test_stop
    self.timer_start = timer()
    self.type = type

    no_train_period = start is None and stop is None

    if train_dataset is not None:
        print('Sets training data .... ')
        self.dataset = train_dataset
    elif no_train_period and test_start is not None and test_stop is not None:
        # Train on everything outside the held-out test period.
        if type == 'ar':
            files = get_list_of_files_excluding_period(test_start, test_stop)
        else:
            files = get_list_of_files_excluding_period_traditional_model(test_start, test_stop)
        self.dataset = merge(files)
    elif no_train_period and test_start is None and test_stop is None:
        raise ValueError('Something is wrong with')
    else:
        # The training period decides which files are read. The previous
        # code passed test_start/test_stop here, so the model was trained
        # on the test period (or on None/None when no test period was set).
        if type == 'ar':
            files = get_list_of_files(start, stop)
        else:
            files = get_list_of_files_traditional_model(start, stop)
        self.dataset = merge(files)

    if test_dataset is not None:
        print('Sets test data')
        self.test_dataset = test_dataset
    elif self.test_start is not None and self.test_stop is not None:
        print('Loads test data -- this should happen')
        if self.type == 'ar':
            files = get_list_of_files(start=self.test_start,
                                      stop=self.test_stop,
                                      include_start=True,
                                      include_stop=True)
        else:
            files = get_list_of_files_traditional_model(start=self.test_start,
                                                        stop=self.test_stop,
                                                        include_start=True,
                                                        include_stop=True)
        self.test_dataset = merge(files)
    else:
        # Always define the attribute so later reads do not raise
        # AttributeError when no test data is available.
        self.test_dataset = None
    print('Finished loaded the dataset')

    self.order = order

    if latitude is None:
        self.latitude = self.dataset.latitude.values
    else:
        self.latitude = np.array(latitude)
    print('sets latitude values to be {}'.format(self.latitude))

    if longitude is None:
        self.longitude = self.dataset.longitude.values
    else:
        self.longitude = np.array(longitude)
    print('sets longitude values to be {}'.format(self.longitude))

    if type == 'ar':
        self.variables = ['t2m', 'q', 'r', 'sp']
    else:
        # Traditional model: no environmental variables.
        self.variables = []
    print('Sets enviornmental variables to {}'.format(self.variables))

    # Results, filled in later.
    self.coeff_matrix = None
    self.evaluate_ds = None
    self.mse = None
    self.r2 = None
    self.ase = None
    self.mse_train = None
    self.r2_train = None
    self.ase_train = None
    self.num_test_samples = None
    self.num_train_samples = None

    self.transform = transform
    self.sigmoid = sigmoid

    if self.transform:
        self.bias = False
        print("Transform is true, forces bias to be false .. ")
    else:
        self.bias = bias

    self.X_train = None
    self.y_train = None
    return