Example #1
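    # assumed module-level imports in the surrounding file: numpy as np, pandas as pd,
    # torch, a package-local `utils` module, and a LOGGER instance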
    def _load_combined_attributes(self):
        """This function loads data set specific attributes and combines them with hydroatlas attributes"""
        dfs = []

        # load dataset-specific attributes from the subclass
        df = self._load_attributes()

        if df is not None:
            # during training (not finetuning), check for NaNs in the per-feature std.
            if self._compute_scaler:
                utils.attributes_sanity_check(df=df)

            dfs.append(df)

        # HydroATLAS attributes can be used with any dataset
        if self.cfg.hydroatlas_attributes:
            dfs.append(self._load_hydroatlas_attributes())

        if dfs:
            # combine all attributes into a single dataframe
            df = pd.concat(dfs, axis=1)
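            # (axis=1 concatenation aligns the frames on their shared basin index)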

            # check if any attribute specified in the config is not available in the dataframes
            combined_attributes = self.cfg.camels_attributes + self.cfg.hydroatlas_attributes
            missing_columns = [attr for attr in combined_attributes if attr not in df.columns]
            if missing_columns:
                raise ValueError(
                    f"The following attributes are not available in the dataset: {missing_columns}"
                )

            # fix the column order to be alphabetical
            df = df.sort_index(axis=1)

            # calculate statistics and normalize features
            if self._compute_scaler:
                self.scaler["attribute_means"] = df.mean()
                self.scaler["attribute_stds"] = df.std()

            if any(k.startswith("camels_attr") for k in self.scaler):
                LOGGER.warning(
                    "Deprecation warning: Using old scaler files won't be supported in the upcoming release."
                )

                # here we assume that only CAMELS attributes are used
                df = (df - self.scaler["camels_attr_means"]) / self.scaler["camels_attr_stds"]
            else:
                df = (df - self.scaler["attribute_means"]) / self.scaler["attribute_stds"]

            # store each basin's feature vector as a PyTorch tensor
            for basin in self.basins:
                attributes = df.loc[df.index == basin].values.flatten()
                self.attributes[basin] = torch.from_numpy(attributes.astype(np.float32))

    def _load_hydroatlas_attributes(self):
        """Load HydroATLAS attributes for all basins and subset them to the configured columns."""
        df = utils.load_hydroatlas_attributes(self.cfg.data_dir, basins=self.basins)

        # remove all attributes not defined in the config
        drop_cols = [c for c in df.columns if c not in self.cfg.hydroatlas_attributes]
        df = df.drop(drop_cols, axis=1)

        if self.is_train:
            # sanity check attributes for NaN in per-feature standard deviation
            utils.attributes_sanity_check(df=df)

        return df
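
For orientation, here is a minimal standalone sketch of the normalize-then-tensorize pattern that `_load_combined_attributes` implements. The attribute names, values, and basin IDs below are invented for illustration:

import numpy as np
import pandas as pd
import torch

# toy attribute table indexed by basin id (values invented for illustration)
df = pd.DataFrame(
    {"area": [120.0, 455.5, 87.3], "mean_elev": [310.0, 1205.0, 95.0]},
    index=["basin_a", "basin_b", "basin_c"],
)

# alphabetical column order, as in _load_combined_attributes above
df = df.sort_index(axis=1)

# z-score normalization with statistics computed from the same table
means, stds = df.mean(), df.std()
df = (df - means) / stds

# one float32 tensor per basin, mirroring the per-basin loop above
attributes = {
    basin: torch.from_numpy(row.to_numpy().astype(np.float32))
    for basin, row in df.iterrows()
}
print(attributes["basin_a"])  # tensor with one entry per attribute column

Keeping one pre-normalized tensor per basin means later lookups (e.g. when assembling samples) are a cheap dictionary access rather than a repeated DataFrame slice.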