Пример #1
0
    def read(self):
        """Populate this dataset from its JSON- and HDF5-files on disk

        The JSON-file is loaded first and must contain an entry keyed by `self.name`. That entry provides the number
        of observations and a mapping from table name to datatype. For every table, the reader method named
        `_read_<datatype>` on this object is looked up and called with the table name, the JSON entry and the HDF5
        group of the dataset. Finally `meta` and `vars` are restored from the JSON entry, falling back to an empty
        dictionary and the current `vars` respectively when they are not stored.

        Raises:
            FileNotFoundError:  If the JSON-file has no entry for `self.name`.
        """
        # Load the complete JSON-file, it may hold several datasets
        path_json = files.path("dataset_json", file_vars=self.vars)
        with files.open_path(path_json, mode="rt", write_log=False) as fid:
            all_json = json.load(fid)

        # Bail out early when this dataset is not stored in the file
        if self.name not in all_json:
            message = "Dataset {} not found in file {}".format(self.name, path_json)
            raise FileNotFoundError(message)

        log.debug(f"Read dataset {self.vars['tech']}-{self.vars['stage']} from disk at {path_json.parent}")
        dset_json = all_json[self.name]
        self._num_obs = dset_json["_num_obs"]
        table_types = dset_json["_tables"]

        # The HDF5-file must stay open while the tables are read
        with files.open_datafile("dataset_hdf5", file_vars=self.vars, mode="r", write_log=False) as h5_file:
            h5_group = h5_file[self.name]

            # Dispatch each table to the reader matching its datatype
            for table_name, datatype in table_types.items():
                reader = getattr(self, "_read_" + datatype)
                reader(table_name, dset_json, h5_group)

        # Restore meta and vars, keeping current vars if none were stored
        self.meta = dset_json.get("_meta", {})
        self.vars = dset_json.get("_vars", self.vars)
Пример #2
0
    def delete_from_file(self,
                         tech=None,
                         stage=None,
                         dataset_name=None,
                         dataset_id=None):
        """Delete this or related datasets from file

        Specify arguments relative to this dataset to find datasets which will be deleted. Any argument left as None
        defaults to the corresponding value of this dataset, so calling the method without arguments targets this
        dataset only. Only ids that are both requested and reported by `_data.list_dataset_ids` are deleted; if there
        are none, the method returns without touching any file.

        Each deleted dataset is removed from both the JSON- and the HDF5-file. When no dataset entries remain in the
        JSON-file afterwards, both files are removed from disk.

        Args:
            tech:          Value replacing `self.vars["tech"]` when locating the files.
            stage:         Value replacing `self.vars["stage"]` when locating the files.
            dataset_name:  Name of the dataset(s) to delete, used as prefix of the stored `<name>/<id>` keys.
            dataset_id:    Id(s) to delete. When given, it is first passed through `_data.parse_dataset_id`; the
                           result may be a single number or an iterable of numbers.
        """
        # Use existing names as default
        tech = self.vars["tech"] if tech is None else tech
        stage = self.vars["stage"] if stage is None else stage
        dataset_name = self.dataset_name if dataset_name is None else dataset_name
        if dataset_id is None:
            dataset_id = self.dataset_id
        else:
            dataset_id = _data.parse_dataset_id(self.rundate, tech, stage, dataset_name, dataset_id)

        # A single number is wrapped in a set, anything else is treated as an iterable of ids
        requested_ids = {dataset_id} if isinstance(dataset_id, (float, int)) else set(dataset_id)

        # NOTE(review): `dataset_name` is passed twice below (3rd and 5th positional argument) -- confirm against
        # the signature of `_data.list_dataset_ids`, which is not visible from here.
        existing_ids = set(_data.list_dataset_ids(self.rundate, tech, dataset_name, stage, dataset_name))
        ids_to_delete = requested_ids & existing_ids
        if not ids_to_delete:
            return

        # Open JSON and HDF5 file and remove datasets
        file_vars = dict(self.vars, tech=tech, stage=stage)
        json_path = files.path("dataset_json", file_vars=file_vars)
        with files.open_path(json_path, mode="rt", write_log=False) as f_json:
            json_all = json.load(f_json)
        with files.open_datafile("dataset_hdf5", file_vars=file_vars, mode="a", write_log=False) as f_hdf5:
            for id_to_delete in ids_to_delete:
                # The set intersection may keep a float such as 1.0 instead of the equal int 1, and the `d` format
                # code rejects floats, so convert explicitly before formatting.
                name = "{name}/{id:04d}".format(name=dataset_name, id=int(id_to_delete))
                del json_all[name]
                del f_hdf5[name]
                log.debug(f"Deleted {name} from dataset {tech}-{stage} at {json_path.parent}")

        # Dataset entries are keyed "<name>/<id>" (see `name` above). As long as at least one such key is left the
        # JSON-file is rewritten, otherwise both files are deleted instead of first writing a JSON-file without
        # datasets.
        if any("/" in key for key in json_all):
            with files.open_path(json_path, mode="wt", write_log=False) as f_json:
                json.dump(json_all, f_json)
        else:
            json_path.unlink()
            files.path("dataset_hdf5", file_vars=file_vars).unlink()
Пример #3
0
    def write(self, write_level=None):
        """Store this dataset on disk in a JSON-file and an HDF5-file

        Existing content of the JSON-file is loaded (an empty structure is used if the file does not exist) and the
        entry keyed by `self.name` is updated. In the HDF5-file any existing group named `self.name` is deleted and
        created anew. Every table that reports fields for the effective write level is asked to write itself, and is
        handed both the JSON entry and the HDF5 group so it can choose where to put its data. Afterwards
        bookkeeping information (version, number of observations, table datatypes, units, write levels, meta and
        vars) is added to the JSON entry, the id of this dataset is recorded as the last one written for
        `self.dataset_name`, and the JSON-file is written back.

        Args:
            write_level:  Passed as `value` to the `write_level` lookup in the tech configuration. The name of the
                          resulting enum is used to select fields and is passed on to each table.
        """
        path_json = files.path("dataset_json", file_vars=self.vars)
        log.debug(f"Write dataset {self.vars['tech']}-{self.vars['stage']} to disk at {path_json.parent}")

        # Resolve the effective write level through the configuration
        level_setting = config.tech.get("write_level", value=write_level)
        write_level = level_setting.as_enum("write_level").name

        # Load what is already stored in the JSON-file, if anything
        try:
            with files.open_path(path_json, mode="rt", write_log=False) as fid:
                all_json = json.load(fid)
        except FileNotFoundError:
            all_json = {}
        dset_json = all_json.setdefault(self.name, {})

        # Only tables with fields at this write level are written
        tables = []
        for candidate in self._data.values():
            if candidate.get_fields(write_level):
                tables.append(candidate)

        # Start from a fresh HDF5 group for this dataset
        with files.open_datafile("dataset_hdf5", file_vars=self.vars, mode="a", write_log=False) as h5_file:
            if self.name in h5_file:
                del h5_file[self.name]
            h5_group = h5_file.create_group(self.name)

            # Let each table write itself (HDF5-data are automatically written to disk)
            for tbl in tables:
                tbl.write(dset_json, h5_group, write_level)

        # Bookkeeping stored next to the table data in the JSON entry
        dset_json.update(
            _version=where.__version__,
            _num_obs=self.num_obs,
            _tables={tbl.name: tbl.datatype for tbl in tables},
            _units={tbl.name: tbl._units for tbl in tables},
            _write_levels={tbl.name: tbl._write_level_strings for tbl in tables},
            _meta=self.meta,
            _vars=self.vars,
        )

        # Remember which dataset_id was written last for this dataset name
        name_entry = all_json.setdefault(self.dataset_name, {})
        name_entry["_last_dataset_id"] = self.dataset_id

        # Persist the updated JSON structure
        with files.open_path(path_json, mode="wt", write_log=False) as fid:
            json.dump(all_json, fid)