def delete_from_file(self, tech=None, stage=None, dataset_name=None, dataset_id=None):
    """Delete this or related datasets from file

    Specify arguments relative to this dataset to find datasets which will be deleted.
    """
    # Use existing names as default
    tech = self.vars["tech"] if tech is None else tech
    stage = self.vars["stage"] if stage is None else stage
    dataset_name = self.dataset_name if dataset_name is None else dataset_name
    if dataset_id is None:
        dataset_id = self.dataset_id
    else:
        dataset_id = _data.parse_dataset_id(self.rundate, tech, stage, dataset_name, dataset_id)
    dataset_id = {dataset_id} if isinstance(dataset_id, (float, int)) else set(dataset_id)

    ids_to_delete = dataset_id & set(_data.list_dataset_ids(self.rundate, tech, dataset_name, stage, dataset_name))
    if not ids_to_delete:
        return

    # Open JSON and HDF5 file and remove datasets
    file_vars = dict(self.vars, tech=tech, stage=stage)
    json_path = files.path("dataset_json", file_vars=file_vars)
    with files.open_path(json_path, mode="rt", write_log=False) as f_json:
        json_all = json.load(f_json)
    with files.open_datafile("dataset_hdf5", file_vars=file_vars, mode="a", write_log=False) as f_hdf5:
        for id_to_delete in ids_to_delete:
            name = "{name}/{id:04d}".format(name=dataset_name, id=id_to_delete)
            del json_all[name]
            del f_hdf5[name]
            log.debug(
                "Deleted {name} from dataset {tech}-{stage} at {directory}",
                name=name,
                tech=tech,
                stage=stage,
                directory=json_path.parent,
            )
    with files.open_path(json_path, mode="wt", write_log=False) as f_json:
        json.dump(json_all, f_json)

    # Delete files if all datasets are deleted
    if not any("/" in k for k in json_all.keys()):
        json_path.unlink()
        files.path("dataset_hdf5", file_vars=file_vars).unlink()
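# A hedged usage sketch (the call patterns below are illustrative assumptions, not taken from
# the code base): with no arguments only this dataset is removed, while passing e.g. a
# different stage removes the corresponding dataset written for that stage.
#
#     dset.delete_from_file()                  # delete only this dataset
#     dset.delete_from_file(stage="estimate")  # delete the matching dataset of another stage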
def write(self, write_level=None):
    """Write a dataset to file

    A dataset is stored on disk in two files, one JSON-file and one HDF5-file. Typically the HDF5-file is
    great for handling numeric data, while JSON is more flexible. The actual writing of the data is handled
    by the individual datatype table-classes. These classes are free to choose how they divide the data
    between the JSON- and HDF5-files, as long as they are able to recover all the data.
    """
    json_path = files.path("dataset_json", file_vars=self.vars)
    log.debug(f"Write dataset {self.vars['tech']}-{self.vars['stage']} to disk at {json_path.parent}")

    # Read write level from config
    write_level = config.tech.get("write_level", value=write_level).as_enum("write_level").name

    # Read existing data in JSON-file
    try:
        with files.open_path(json_path, mode="rt", write_log=False) as f_json:
            json_all = json.load(f_json)
    except FileNotFoundError:
        json_all = dict()
    json_all.setdefault(self.name, dict())
    json_data = json_all[self.name]

    # Figure out which tables have data
    tables = [t for t in self._data.values() if t.get_fields(write_level)]

    # Open HDF5-file
    with files.open_datafile("dataset_hdf5", file_vars=self.vars, mode="a", write_log=False) as f_hdf5:
        if self.name in f_hdf5:
            del f_hdf5[self.name]
        hdf5_data = f_hdf5.create_group(self.name)

        # Write data for each table (HDF5-data are automatically written to disk)
        for table in tables:
            table.write(json_data, hdf5_data, write_level)

    # Store metadata in JSON-data
    json_data["_version"] = where.__version__
    json_data["_num_obs"] = self.num_obs
    json_data["_tables"] = {tbl.name: tbl.datatype for tbl in tables}
    json_data["_units"] = {tbl.name: tbl._units for tbl in tables}
    json_data["_write_levels"] = {tbl.name: tbl._write_level_strings for tbl in tables}
    json_data["_meta"] = self.meta
    json_data["_vars"] = self.vars

    # Store last dataset_id written to
    json_all.setdefault(self.dataset_name, dict())["_last_dataset_id"] = self.dataset_id

    # Write JSON-data to file
    with files.open_path(json_path, mode="wt", write_log=False) as f_json:
        json.dump(json_all, f_json)
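# A minimal usage sketch of writing a dataset (the Dataset constructor arguments and the
# add_float call are assumptions for illustration only, not taken from this listing):
#
#     from where import data
#     dset = data.Dataset(rundate, tech="vlbi", stage="read", dataset_name="all", dataset_id=0)
#     dset.add_float("residual", val=residuals)   # hypothetical field
#     dset.write()                                # stores JSON metadata and HDF5 tables side by side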
def read(self):
    """Read a dataset from file

    A dataset is stored on disk in two files, one JSON-file and one HDF5-file. Typically the HDF5-file is
    great for handling numeric data, while JSON is more flexible. The actual reading of the data is handled
    by the individual datatype table-classes. The dispatch to the correct class is done by functions defined
    in the :func:`Dataset._read`-method which is called by :mod:`where.data._data` when Dataset is first
    imported.
    """
    # Open and read JSON-file
    json_path = files.path("dataset_json", file_vars=self.vars)
    with files.open_path(json_path, mode="rt", write_log=False) as f_json:
        json_all = json.load(f_json)
    if self.name not in json_all:
        raise FileNotFoundError("Dataset {} not found in file {}".format(self.name, json_path))

    log.debug(f"Read dataset {self.vars['tech']}-{self.vars['stage']} from disk at {json_path.parent}")
    json_data = json_all[self.name]
    self._num_obs = json_data["_num_obs"]
    tables = json_data["_tables"]

    # Open HDF5-file
    with files.open_datafile("dataset_hdf5", file_vars=self.vars, mode="r", write_log=False) as f_hdf5:
        hdf5_data = f_hdf5[self.name]

        # Read data for each table by dispatching to read function based on datatype
        for table, dtype in tables.items():
            read_func = getattr(self, "_read_" + dtype)
            read_func(table, json_data, hdf5_data)

    # Add meta and vars properties
    self.meta = json_data.get("_meta", dict())
    self.vars = json_data.get("_vars", self.vars)
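# The read loop above dispatches on the datatype string recorded at write time (the "_tables"
# entry of the JSON file) via getattr, so each table is read back by the reader matching its
# datatype. A self-contained sketch of that dispatch pattern (the class and the datatype names
# are illustrative, not the actual table classes):
class _DispatchSketch:

    def _read_float(self, table):
        print(f"reading float table {table}")

    def _read_text(self, table):
        print(f"reading text table {table}")

    def read_tables(self, tables):
        for table, dtype in tables.items():
            read_func = getattr(self, "_read_" + dtype)  # e.g. "float" -> self._read_float
            read_func(table)

# _DispatchSketch().read_tables({"residual": "float", "station": "text"}) prints one line per table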
def read_data(self):
    """Read data from a data file and parse the contents"""
    # Get chain of parsers
    parsers_chain = iter(self.setup_parser())
    parser = next(parsers_chain)  # Pointing to first parser
    cache = dict(line_num=0)

    with files.open_path(self.file_path, mode="rt", encoding=self.file_encoding) as fid:
        # Get iterators for current and next line
        line_iter, next_line_iter = itertools.tee(fid)
        next(next_line_iter, None)

        # Iterate over all file lines including last line by using zip_longest
        for line, next_line in itertools.zip_longest(line_iter, next_line_iter):
            cache["line_num"] += 1
            self.parse_line(line.rstrip(), cache, parser)

            # Skip to next parser
            if next_line is None or parser.end_marker(line.rstrip(), cache["line_num"], next_line):
                if parser.end_callback is not None:
                    parser.end_callback(cache)
                cache = dict(line_num=0)
                try:
                    parser = next(parsers_chain)
                except StopIteration:
                    break
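# The look-ahead idiom used above, in isolation: itertools.tee duplicates the line iterator,
# the copy is advanced one step, and zip_longest pads it with None so the final line is still
# visited and is recognizable by next_line being None. A self-contained sketch:
import itertools

def _pairs_with_lookahead(lines):
    """Yield (line, next_line) pairs, with next_line None for the last line"""
    line_iter, next_line_iter = itertools.tee(iter(lines))
    next(next_line_iter, None)  # Shift the look-ahead iterator one step forward
    yield from itertools.zip_longest(line_iter, next_line_iter)

# list(_pairs_with_lookahead(["a", "b", "c"])) == [("a", "b"), ("b", "c"), ("c", None)]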
def read_data(self):
    """Parse the vgosdb wrapper file

    self.data will be populated with information from the netcdf files
    """
    with files.open_path(self.file_path, mode="rt") as fid:
        self._parse_file(fid)
    self._organize_data()
def read_data(self):
    """Read data from datafiles

    This is a basic implementation that parses data from one file as specified by the file_key and vars
    properties. For more advanced uses, e.g. reading data from several files, this method should be
    overridden. In that case, make sure file dependencies are appended to the self.dependencies-list.
    """
    self.dependencies.append(self.file_path)
    is_zipped = files.is_path_zipped(self.file_path)
    if files.empty_file(self.file_path):
        log.warn(f"File {self.file_path} is empty.")
    with files.open_path(self.file_path, mode="rt", is_zipped=is_zipped) as fid:
        self.parse_file(fid)
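# A hedged sketch of such an override for a parser needing a second input file (the extra file
# key "file_key_aux" is hypothetical; the files.path/open_path calls mirror the code above):
#
#     def read_data(self):
#         for path in (self.file_path, files.path("file_key_aux", file_vars=self.vars)):
#             self.dependencies.append(path)
#             with files.open_path(path, mode="rt") as fid:
#                 self.parse_file(fid)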
def _get_sp3_file_version(file_path):
    """Get SP3 file version for a given file path

    Args:
        file_path (str):  SP3 file path

    Returns:
        str:  SP3 file version number
    """
    with files.open_path(file_path, mode="rt") as infile:
        version = infile.readline().split()[0]

    if len(version) < 2 or version[1] not in "acd":
        log.fatal(f"Unknown SP3 format {version!r} is used in file {file_path}")

    return version[1]
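# The version character sits in column two of the first SP3 header line ("#aP...", "#cP...",
# "#dP..." or "#dV..."), which is what the [1] index above picks out. A small illustration with
# a made-up header line:
first_sp3_line = "#dP2021  7  1  0  0  0.00000000      96 ORBIT IGS14 HLM  IGS"
sp3_version = first_sp3_line.split()[0]   # -> "#dP2021"
assert sp3_version[1] in "acd"            # sp3_version[1] == "d", i.e. SP3-d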
def read_data(self):
    """Read data from a Sinex file and parse the contents

    First the whole Sinex file is read and the requested blocks are stored in self._sinex. After the file
    has been read, a parser is called on each block so that self.data is properly populated.
    """
    # Read raw sinex data to self._sinex from file
    with files.open_path(self.file_path, mode="rb") as fid:
        if self._header:
            self.parse_header_line(next(fid))  # Header must be first line
        self.parse_blocks(fid)

    # Apply parsers to raw sinex data, the information returned by parsers is stored in self.data
    for sinex_block in self.sinex_blocks:
        if sinex_block.parser and sinex_block.marker in self._sinex:
            params = self._sinex.get("__params__", dict()).get(sinex_block.marker, ())
            data = sinex_block.parser(self._sinex.get(sinex_block.marker), *params)
            if data is not None:
                self.data[sinex_block.marker] = data
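# In the SINEX format the blocks parsed above are delimited by "+BLOCK_NAME" / "-BLOCK_NAME"
# lines, with "*" marking comment lines. A self-contained sketch of grouping raw block lines
# that way (not the actual implementation of parse_blocks):
def _collect_sinex_blocks(lines):
    """Group lines between +MARKER and -MARKER into a dict of raw block lines"""
    blocks, marker = {}, None
    for line in lines:
        if line.startswith("+"):
            marker = line[1:].split()[0]
            blocks[marker] = []
        elif line.startswith("-"):
            marker = None
        elif marker is not None and not line.startswith("*"):
            blocks[marker].append(line.rstrip())
    return blocks

# _collect_sinex_blocks(["+SITE/ID", " AAAA ...", "-SITE/ID"]) == {"SITE/ID": [" AAAA ..."]}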
def read_data(self):
    """Read the data file and parse the contents"""
    with files.open_path(self.file_path, mode="rt") as fid:
        self._parse_file(fid)