def _load_combined_attributes(self): """This function loads data set specific attributes and combines them with hydroatlas attributes""" dfs = [] # load dataset specific attributes from the subclass df = self._load_attributes() if df is not None: # in case of training (not finetuning) check for NaNs in feature std. if self._compute_scaler: utils.attributes_sanity_check(df=df) dfs.append(df) # Hydroatlas attributes can be used everywhere if self.cfg.hydroatlas_attributes: dfs.append(self._load_hydroatlas_attributes()) if dfs: # combine all attributes into a single dataframe df = pd.concat(dfs, axis=1) # check if any attribute specified in the config is not available in the dataframes combined_attributes = self.cfg.camels_attributes + self.cfg.hydroatlas_attributes missing_columns = [ attr for attr in combined_attributes if attr not in df.columns ] if missing_columns: raise ValueError( f"The following attributes are not available in the dataset: {missing_columns}" ) # fix the order of the columns to be alphabetically df = df.sort_index(axis=1) # calculate statistics and normalize features if self._compute_scaler: self.scaler["attribute_means"] = df.mean() self.scaler["attribute_stds"] = df.std() if any([k.startswith("camels_attr") for k in self.scaler.keys()]): LOGGER.warning( "Deprecation warning: Using old scaler files won't be supported in the upcoming release." ) # Here we assume that only camels attributes are used df = (df - self.scaler['camels_attr_means'] ) / self.scaler["camels_attr_stds"] else: df = (df - self.scaler['attribute_means'] ) / self.scaler["attribute_stds"] # preprocess each basin feature vector as pytorch tensor for basin in self.basins: attributes = df.loc[df.index == basin].values.flatten() self.attributes[basin] = torch.from_numpy( attributes.astype(np.float32))
def _load_hydroatlas_attributes(self):
    """Load hydroatlas attributes for ``self.basins``, keeping only configured columns.

    Returns
    -------
    pd.DataFrame
        One row per basin, restricted to the attributes listed in
        ``cfg.hydroatlas_attributes``.
    """
    df = utils.load_hydroatlas_attributes(self.cfg.data_dir, basins=self.basins)

    # remove all attributes not defined in the config
    drop_cols = [c for c in df.columns if c not in self.cfg.hydroatlas_attributes]
    df = df.drop(drop_cols, axis=1)

    # Sanity check for NaNs in the per-feature standard deviation. Guarded by
    # `_compute_scaler` (not `is_train`) for consistency with
    # `_load_combined_attributes`: the check should only run when a new scaler is
    # computed (training), not when finetuning with a precomputed scaler.
    if self._compute_scaler:
        utils.attributes_sanity_check(df=df)

    return df