def validate_internal_dframe(obs_df: pd.DataFrame) -> bool:
    """Validate the internal dataframe format for observations.

    Will log warnings and/or errors if anything found.

    Args:
        obs_df: Dataframe to validate

    Returns:
        True if everything is ok (or empty)
    """
    failed = False
    if obs_df.empty:
        logger.warning("Observation dataframe empty")
        return True
    if "CLASS" not in obs_df:
        logger.error("CLASS is not in dataframe - not valid")
        failed = True
    if "LABEL" not in obs_df:
        logger.error("LABEL is not in dataframe - not valid")
        failed = True
    # Only inspect CLASS values when the column exists; indexing a missing
    # column would raise KeyError instead of returning False as documented.
    if "CLASS" in obs_df:
        non_supported_classes = set(obs_df["CLASS"]) - set(CLASS_SHORTNAME.keys())
        if non_supported_classes:
            logger.error(
                "Unsupported observation classes: %s", str(non_supported_classes)
            )
            failed = True
    # Rows must be unique over the subset of identifying columns present.
    index = {"CLASS", "LABEL", "OBS", "SEGMENT"}.intersection(set(obs_df.columns))
    if index:
        # set_index() with an empty list would raise, hence the guard above.
        repeated_rows = obs_df[
            obs_df.set_index(list(index)).index.duplicated(keep=False)
        ]
        if not repeated_rows.empty:
            logger.error("Non-unique observation classes and labels")
            logger.error("\n%s", str(repeated_rows.dropna(axis="columns", how="all")))
            failed = True

    # Possibilities for further validation:
    #  * Check that segment has start and end if not default.
    #  * SUMMARY_OBSERVATION requires four arguments (also for resinsight?)
    #  * BLOCK_OBSERVATIONk requires two global, and j, k, value, error for
    #    each subunit.
    #  * block requires label
    #  * general requires data, restart, obs_file. index_list, index_file,
    #  * error_covariance is optional.

    logger.info("Observation dataframe validated")
    return not failed
def autoparse_file(filename):
    """Detects the observation file format for a given filename.

    This is done by attempting to parse its content and giving up on
    exceptions.

    NB: In case of ERT file formats, the include statements are
    interpreted relative to current working directory. Thus it is
    recommended to reparse with correct cwd after detecting ERT file
    format. The correct cwd for include-statement is the path of the ERT
    config file, which is outside the context of fmuobs.

    Args:
        filename (str)

    Returns:
        tuple: First element is a string in [resinsight, csv, yaml, ert],
        second element is a dataframe or a dict (if input was yaml).
    """
    # Attempt formats in order of decreasing strictness; each parser that
    # fails with ValueError means "not this format, try the next one".
    try:
        dframe = pd.read_csv(filename, sep=";")
        if {"DATE", "VECTOR", "VALUE", "ERROR"}.issubset(
            set(dframe.columns)
        ) and not dframe.empty:
            logger.info("Parsed %s as a ResInsight observation file", filename)
            return ("resinsight", resinsight_df2df(dframe))
    except ValueError:
        pass

    try:
        dframe = pd.read_csv(filename, sep=",")
        if {"CLASS", "LABEL"}.issubset(dframe.columns) and not dframe.empty:
            logger.info(
                "Parsed %s as a CSV (internal dataframe format for ertobs) file",
                filename,
            )
            if "DATE" in dframe:
                dframe["DATE"] = pd.to_datetime(dframe["DATE"])
            return ("csv", dframe)
    except ValueError:
        pass

    try:
        with open(filename, encoding="utf-8") as f_handle:
            obsdict = yaml.safe_load(f_handle.read())
        if isinstance(obsdict, dict):
            if obsdict.get("smry", None) or obsdict.get("rft", None):
                logger.info("Parsed %s as a YAML file with observations", filename)
                return ("yaml", obsdict2df(obsdict))
    except yaml.scanner.ScannerError as exception:
        # This occurs if there are tabs in the file, which is not
        # allowed in a YAML file (but it can be present in ERT observation files)
        logger.debug("ScannerError while attempting yaml-parsing")
        logger.debug(str(exception))
    except ValueError:
        pass

    try:
        with open(filename, encoding="utf-8") as f_handle:
            # This function does not have information on include file paths.
            # Accept a FileNotFoundError while parsing, if we encounter that
            # it is most likely an ert file, but which needs additional hints
            # on where include files are located.
            try:
                dframe = ertobs2df(f_handle.read())
            except FileNotFoundError:
                logger.info(
                    "Parsed %s as an ERT observation file, with include statements",
                    filename,
                )
                return ("ert", pd.DataFrame())
        if {"CLASS", "LABEL"}.issubset(dframe.columns) and not dframe.empty:
            if set(dframe["CLASS"]).intersection(set(CLASS_SHORTNAME.keys())):
                logger.info("Parsed %s as an ERT observation file", filename)
                return ("ert", dframe)
    except ValueError:
        pass

    logger.error(
        "Unable to parse %s as any supported observation file format", filename
    )
    # BUGFIX: was `pd.DataFrame` (the class object); return an (empty)
    # DataFrame instance as documented and as the other branches do.
    return (None, pd.DataFrame())