def read(self):
    """Read a dataset from file

    A dataset is stored on disk in two files, one JSON-file and one HDF5-file. Typically the HDF5-file is great
    for handling numeric data, while JSON is more flexible. The actual reading of the data is handled by the
    individual datatype table-classes. The dispatch to the correct class is done by the
    ``_read_<datatype>``-methods, which are set up by :mod:`where.data._data` when Dataset is first imported.
    """
    # Open and read JSON-file
    json_path = files.path("dataset_json", file_vars=self.vars)
    with files.open_path(json_path, mode="rt", write_log=False) as f_json:
        json_all = json.load(f_json)
    if self.name not in json_all:
        raise FileNotFoundError("Dataset {} not found in file {}".format(self.name, json_path))

    log.debug(f"Read dataset {self.vars['tech']}-{self.vars['stage']} from disk at {json_path.parent}")
    json_data = json_all[self.name]
    self._num_obs = json_data["_num_obs"]
    tables = json_data["_tables"]

    # Open HDF5-file
    with files.open_datafile("dataset_hdf5", file_vars=self.vars, mode="r", write_log=False) as f_hdf5:
        hdf5_data = f_hdf5[self.name]

        # Read data for each table by dispatching to a read function based on the datatype
        for table, dtype in tables.items():
            read_func = getattr(self, "_read_" + dtype)
            read_func(table, json_data, hdf5_data)

    # Add meta and vars properties
    self.meta = json_data.get("_meta", dict())
    self.vars = json_data.get("_vars", self.vars)
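
# The getattr-dispatch in read() relies on a naming convention: a table with
# datatype "dtype" is read by a method named "_read_<dtype>". A minimal,
# self-contained sketch of that pattern (the class and method bodies below are
# illustrative only, not part of the actual Dataset API):
class _DispatchSketch:
    def _read_float(self, table, json_data, hdf5_data):
        print(f"Reading float table {table!r}")

    def read_table(self, table, dtype, json_data, hdf5_data):
        read_func = getattr(self, "_read_" + dtype)  # e.g. self._read_float
        read_func(table, json_data, hdf5_data)


# Example: _DispatchSketch().read_table("residual", "float", {}, {}) prints
# "Reading float table 'residual'".
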
def delete_from_file(self, tech=None, stage=None, dataset_name=None, dataset_id=None):
    """Delete this or related datasets from file

    Specify arguments relative to this dataset to find the datasets that will be deleted.
    """
    # Use existing names as default
    tech = self.vars["tech"] if tech is None else tech
    stage = self.vars["stage"] if stage is None else stage
    dataset_name = self.dataset_name if dataset_name is None else dataset_name
    if dataset_id is None:
        dataset_id = self.dataset_id
    else:
        dataset_id = _data.parse_dataset_id(self.rundate, tech, stage, dataset_name, dataset_id)
    dataset_id = {dataset_id} if isinstance(dataset_id, (float, int)) else set(dataset_id)
    ids_to_delete = dataset_id & set(_data.list_dataset_ids(self.rundate, tech, dataset_name, stage, dataset_name))
    if not ids_to_delete:
        return

    # Open JSON- and HDF5-files and remove datasets
    file_vars = dict(self.vars, tech=tech, stage=stage)
    json_path = files.path("dataset_json", file_vars=file_vars)
    with files.open_path(json_path, mode="rt", write_log=False) as f_json:
        json_all = json.load(f_json)
    with files.open_datafile("dataset_hdf5", file_vars=file_vars, mode="a", write_log=False) as f_hdf5:
        for id_to_delete in ids_to_delete:
            name = "{name}/{id:04d}".format(name=dataset_name, id=id_to_delete)
            del json_all[name]
            del f_hdf5[name]
            log.debug(f"Deleted {name} from dataset {tech}-{stage} at {json_path.parent}")
    with files.open_path(json_path, mode="wt", write_log=False) as f_json:
        json.dump(json_all, f_json)

    # Delete files if all datasets are deleted
    if not any("/" in k for k in json_all.keys()):
        json_path.unlink()
        files.path("dataset_hdf5", file_vars=file_vars).unlink()
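
def _delete_example(dset):
    """Hypothetical usage of delete_from_file() (not part of the Dataset API)

    `dset` is assumed to be an existing Dataset instance, and the stage name
    below is made up for the example. Arguments left as None fall back to the
    tech, stage, dataset_name and dataset_id of the current dataset.
    """
    # Delete the "estimate"-stage counterpart of the current dataset
    dset.delete_from_file(stage="estimate")
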
def write(self, write_level=None):
    """Write a dataset to file

    A dataset is stored on disk in two files, one JSON-file and one HDF5-file. Typically the HDF5-file is great
    for handling numeric data, while JSON is more flexible. The actual writing of the data is handled by the
    individual datatype table-classes. These classes are free to choose how they divide the data between the
    JSON- and HDF5-files, as long as they are able to recover all the data.
    """
    json_path = files.path("dataset_json", file_vars=self.vars)
    log.debug(f"Write dataset {self.vars['tech']}-{self.vars['stage']} to disk at {json_path.parent}")

    # Read write level from config
    write_level = config.tech.get("write_level", value=write_level).as_enum("write_level").name

    # Read existing data in JSON-file
    try:
        with files.open_path(json_path, mode="rt", write_log=False) as f_json:
            json_all = json.load(f_json)
    except FileNotFoundError:
        json_all = dict()
    json_all.setdefault(self.name, dict())
    json_data = json_all[self.name]

    # Figure out which tables have data
    tables = [t for t in self._data.values() if t.get_fields(write_level)]

    # Open HDF5-file
    with files.open_datafile("dataset_hdf5", file_vars=self.vars, mode="a", write_log=False) as f_hdf5:
        if self.name in f_hdf5:
            del f_hdf5[self.name]
        hdf5_data = f_hdf5.create_group(self.name)

        # Write data for each table (HDF5-data are automatically written to disk)
        for table in tables:
            table.write(json_data, hdf5_data, write_level)

    # Store metadata in JSON-data
    json_data["_version"] = where.__version__
    json_data["_num_obs"] = self.num_obs
    json_data["_tables"] = {tbl.name: tbl.datatype for tbl in tables}
    json_data["_units"] = {tbl.name: tbl._units for tbl in tables}
    json_data["_write_levels"] = {tbl.name: tbl._write_level_strings for tbl in tables}
    json_data["_meta"] = self.meta
    json_data["_vars"] = self.vars

    # Store last dataset_id written to
    json_all.setdefault(self.dataset_name, dict())["_last_dataset_id"] = self.dataset_id

    # Write JSON-data to file
    with files.open_path(json_path, mode="wt", write_log=False) as f_json:
        json.dump(json_all, f_json)
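
# Illustrative shape of the JSON-file after write(). The dataset key follows
# the "{dataset_name}/{dataset_id:04d}"-pattern used by delete_from_file();
# all concrete names and values below are made up for the example:
_example_json_all = {
    "default/0000": {
        "_version": "1.0.0",                               # where.__version__
        "_num_obs": 100,
        "_tables": {"time": "time", "residual": "float"},  # table name -> datatype
        "_units": {},                                      # per-table unit mappings
        "_write_levels": {},                               # per-table write-level strings
        "_meta": {},                                       # contents of self.meta
        "_vars": {},                                       # contents of self.vars
    },
    "default": {"_last_dataset_id": 0},                    # last dataset_id written to
}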