def dump_hdf5(cls, data_file, leave_open=False, with_data=False):
    should_close = False
    if isinstance(data_file, basestring) and os.path.exists(data_file):
        filename = data_file
        data_file = HDFLockingFile(data_file, "r", retry_count=10, retry_wait=0.5)
        should_close = True
        print "HDF5", filename, data_file
    else:
        print "HDF5", data_file

    def dump_item(entry_name):
        parts = entry_name.split("/")
        entry = data_file[entry_name]
        ilevel = len(parts)
        print "%s%s %s" % (" "*ilevel, parts[-1], entry)
        if entry.attrs:
            print "%s [%s]" % (" "*ilevel, ", ".join("%s=%s" % (k, v) for (k, v) in entry.attrs.iteritems()))
        if with_data and hasattr(entry, "value"):
            print "%s %s" % (" "*ilevel, entry.value)

    data_file.visit(dump_item)

    if should_close and not leave_open:
        data_file.close()

    return data_file
def get_data_copy(self, data_filter=None):
    data_filter = data_filter or {}
    ds_filename = self._get_ds_filename()
    if not os.path.exists(ds_filename):
        return {}
    data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
    try:
        return self._get_data_copy(data_file, data_filter=data_filter)
    finally:
        data_file.close()
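# Usage sketch (hypothetical, for illustration only): this get_data_copy() delegates to
# _get_data_copy() below and returns the path of a temporary, filtered copy of the dataset
# file, or None on failure. The filter keys mirror those read by get_data()/get_data_info();
# the persistence object and time bounds here are assumptions, not taken from a real run.
def example_copy_dataset(persistence, start_time=None, end_time=None):
    copy_path = persistence.get_data_copy(data_filter=dict(start_time=start_time,
                                                           end_time=end_time))
    if copy_path:
        # Inspect the copied file using the dump helper defined above
        HDF5Tools.dump_hdf5(copy_path, with_data=False)
    return copy_path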
def get_data_info(self, data_filter=None):
    data_filter = data_filter or {}
    ds_filename = self._get_ds_filename()
    if not os.path.exists(ds_filename):
        return {}
    data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
    try:
        res_info = {}
        max_rows_org = max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
        start_time = data_filter.get("start_time", None)
        end_time = data_filter.get("end_time", None)
        start_time_include = data_filter.get("start_time_include", True) is True
        should_decimate = data_filter.get("decimate", False) is True

        ds_time = data_file["vars/%s" % self.time_var]
        cur_idx = ds_time.attrs["cur_row"]

        res_info["ds_rows"] = cur_idx
        res_info["ds_size"] = len(ds_time)
        res_info["file_size"] = os.path.getsize(ds_filename)
        res_info["file_name"] = ds_filename
        res_info["vars"] = list(data_file["vars"])

        start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)

        res_info["need_expand"] = self.expand_info.get("need_expand", False)
        if self.expand_info.get("need_expand", False):
            max_rows = max_rows / self.expand_info["num_steps"]   # Compensate expansion

        res_info["should_decimate"] = should_decimate
        res_info["need_decimate"] = bool(should_decimate and end_row - start_row > max_rows)

        res_info["ts_first"] = NTP4Time.from_ntp64(ds_time.value[0].tostring()).to_unix()
        res_info["ts_last"] = NTP4Time.from_ntp64(ds_time.value[cur_idx - 1].tostring()).to_unix()
        res_info["ts_first_str"] = get_datetime_str(res_info["ts_first"] * 1000, local_time=False)
        res_info["ts_last_str"] = get_datetime_str(res_info["ts_last"] * 1000, local_time=False)
        res_info["ds_samples"] = cur_idx * self.expand_info["num_steps"] if res_info["need_expand"] else cur_idx

        res_info["filter_start_row"] = start_row
        res_info["filter_end_row"] = end_row
        res_info["filter_max_rows"] = max_rows
        res_info["filter_ts_first"] = NTP4Time.from_ntp64(ds_time.value[start_row].tostring()).to_unix()
        res_info["filter_ts_last"] = NTP4Time.from_ntp64(ds_time.value[end_row - 1].tostring()).to_unix()
        res_info["filter_ts_first_str"] = get_datetime_str(res_info["filter_ts_first"] * 1000, local_time=False)
        res_info["filter_ts_last_str"] = get_datetime_str(res_info["filter_ts_last"] * 1000, local_time=False)

        return res_info
    finally:
        data_file.close()
def get_data_copy(self, data_filter=None):
    data_filter = data_filter or {}
    ds_filename = self._get_ds_filename()
    if not os.path.exists(ds_filename):
        return {}
    data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
    try:
        res_data = {}
        read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
        max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
        start_time = data_filter.get("start_time", None)
        end_time = data_filter.get("end_time", None)
        start_time_include = data_filter.get("start_time_include", True) is True
        time_slice = None

        start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
        log.info("ROW INTERVAL %s %s", start_row, end_row)

        if self.expand_info.get("need_expand", False):
            max_rows = max_rows / self.expand_info["num_steps"]   # Compensate expansion

        if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
            ds_time = data_file["vars/%s" % self.time_var]
            cur_idx = ds_time.attrs["cur_row"]
            for var_name in read_vars:
                ds_path = "vars/%s" % var_name
                if ds_path not in data_file:
                    log.warn("Variable '%s' not in dataset - ignored", var_name)
                    continue
                ds_var = data_file[ds_path]
                data_array = ds_var[max(start_row, end_row-max_rows, 0):end_row]

        elif self.ds_layout == DS_LAYOUT_COMBINED:
            raise NotImplementedError()

        return res_data
    finally:
        data_file.close()
def dump_hdf5(cls, data_file, leave_open=False, with_data=False, with_crow=True):
    should_close = False
    if isinstance(data_file, basestring) and os.path.exists(data_file):
        filename = data_file
        data_file = HDFLockingFile(data_file, "r", retry_count=10, retry_wait=0.5)
        should_close = True
        print "HDF5", filename, data_file
    else:
        print "HDF5", data_file

    def dump_item(entry_name):
        parts = entry_name.split("/")
        entry = data_file[entry_name]
        ilevel = len(parts)
        cur_row = None
        print "%s%s %s" % (" " * ilevel, parts[-1], entry)
        if entry.attrs:
            print "%s [%s]" % (" " * ilevel, ", ".join("%s=%s" % (k, v) for (k, v) in entry.attrs.iteritems()))
        if with_data and hasattr(entry, "value"):
            cur_row = entry.attrs["cur_row"] if entry.attrs and "cur_row" in entry.attrs else None
            cur_row = cur_row or (data_file["vars/time"].attrs["cur_row"] if entry_name.startswith("vars") else None)
            if with_crow and cur_row:
                print "%s %s (%s of %s)" % (" " * ilevel, entry.value[:cur_row], cur_row, len(entry))
            else:
                print "%s %s" % (" " * ilevel, entry.value)

    data_file.visit(dump_item)

    if should_close and not leave_open:
        data_file.close()

    return data_file
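# Usage sketch (illustrative; the file path below is an assumption, not from the source):
# dump_hdf5 accepts either an already open HDF5 handle or a filename. With with_data=True
# each dataset's values are printed; with with_crow=True the printout is truncated at the
# table's cur_row attribute, so only the rows written so far are shown.
def example_dump(filename="/tmp/scion_ds_example.hdf5"):
    return HDF5Tools.dump_hdf5(filename, with_data=True, with_crow=True)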
def extend_dataset(self, packet):
    """ Adds values from a data packet to the dataset and updates indexes and metadata """
    ingest_ts = NTP4Time.utcnow()
    num_rows, cur_idx, time_idx_rows = len(packet.data["data"]), 0, []
    ds_filename = self._get_ds_filename()
    data_file = HDFLockingFile(ds_filename, "r+", retry_count=10, retry_wait=0.5)
    try:
        if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
            # Get index values from time var
            if self.time_var not in packet.data["cols"]:
                raise BadRequest("Packet has no time")
            var_ds = data_file["vars/%s" % self.time_var]
            cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
            var_ds.attrs["cur_row"] += num_rows

            # Fill variables with values from packet or NaN
            for var_name in self.var_defs_map.keys():
                var_ds = data_file["vars/%s" % var_name]
                if cur_idx + num_rows > cur_size:
                    self._resize_dataset(var_ds, num_rows)
                if var_name in packet.data["cols"]:
                    data_slice = packet.data["data"][:][var_name]
                    var_ds[cur_idx:cur_idx+num_rows] = data_slice
                else:
                    # Leave the initial fill value (zeros)
                    #var_ds[cur_idx:cur_idx+num_rows] = [None]*num_rows
                    pass

            extra_vars = set(packet.data["cols"]) - set(self.var_defs_map.keys())
            if extra_vars:
                log.warn("Data packet had extra vars not in dataset: %s", extra_vars)

        elif self.ds_layout == DS_LAYOUT_COMBINED:
            var_ds = data_file["vars/%s" % DS_VARIABLES]
            cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
            if cur_idx + num_rows > cur_size:
                self._resize_dataset(var_ds, num_rows)
            ds_var_names = [var_info["name"] for var_info in self.var_defs]
            pvi = {col_name: col_idx for col_idx, col_name in enumerate(packet.data["cols"])
                   if col_name in ds_var_names}
            for row_idx in xrange(num_rows):
                row_data = packet.data["data"][row_idx]
                row_vals = tuple(row_data[vn] if vn in pvi else None for vn in ds_var_names)
                var_ds[cur_idx+row_idx] = row_vals
            var_ds.attrs["cur_row"] += num_rows

        # Update time_ingest (ts, begin row, count)
        ds_tingest = data_file[DS_TIMEINGEST_PATH]
        if ds_tingest.attrs["cur_row"] + 1 > len(ds_tingest):
            self._resize_dataset(ds_tingest, 1, INTERNAL_ROW_INCREMENT)
        ds_tingest[ds_tingest.attrs["cur_row"]] = (ingest_ts.to_np_value(), cur_idx, num_rows)
        ds_tingest.attrs["cur_row"] += 1

        # Update time_idx (every nth row's time)
        new_idx_row = (cur_idx + num_rows + self.time_idx_step - 1) / self.time_idx_step
        old_idx_row = (cur_idx + self.time_idx_step - 1) / self.time_idx_step
        num_tidx_rows = new_idx_row - old_idx_row
        time_ds = data_file["vars/%s" % (self.time_var if self.ds_layout == DS_LAYOUT_INDIVIDUAL else DS_VARIABLES)]
        time_idx_rows = [time_ds[idx_row*self.time_idx_step] for idx_row in xrange(old_idx_row, new_idx_row)]
        if time_idx_rows:
            ds_tidx = data_file[DS_TIMEIDX_PATH]
            tidx_cur_row = ds_tidx.attrs["cur_row"]
            if tidx_cur_row + num_tidx_rows > len(ds_tidx):
                self._resize_dataset(ds_tidx, num_tidx_rows, INTERNAL_ROW_INCREMENT)
            ds_tidx[tidx_cur_row:tidx_cur_row+num_tidx_rows] = time_idx_rows
            ds_tidx.attrs["cur_row"] += num_tidx_rows

        #HDF5Tools.dump_hdf5(data_file, with_data=True)
    finally:
        data_file.close()
def get_data(self, data_filter=None):
    data_filter = data_filter or {}
    ds_filename = self._get_ds_filename()
    if not os.path.exists(ds_filename):
        return {}
    data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
    try:
        res_data = {}
        read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
        time_format = data_filter.get("time_format", "unix_millis")
        max_rows_org = max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
        start_time = data_filter.get("start_time", None)
        end_time = data_filter.get("end_time", None)
        start_time_include = data_filter.get("start_time_include", True) is True
        should_decimate = data_filter.get("decimate", False) is True
        time_slice = None

        start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
        log.info("Get data for row interval %s to %s", start_row, end_row)

        if self.expand_info.get("need_expand", False):
            max_rows = max_rows / self.expand_info["num_steps"]   # Compensate expansion

        if end_row - start_row > max_rows:
            if should_decimate:
                log.info("Decimating %s rows to satisfy %s max rows", end_row - start_row, max_rows_org)
            else:
                log.info("Truncating %s rows to %s max rows, %s unexpanded", end_row - start_row, max_rows_org, max_rows)

        if self.ds_layout != DS_LAYOUT_INDIVIDUAL:
            raise NotImplementedError()

        ds_time = data_file["vars/%s" % self.time_var]
        cur_idx = ds_time.attrs["cur_row"]
        for var_name in read_vars:
            ds_path = "vars/%s" % var_name
            if ds_path not in data_file:
                log.warn("Variable '%s' not in dataset - ignored", var_name)
                continue
            ds_var = data_file[ds_path]
            start_row_act = start_row if should_decimate else max(start_row, end_row - max_rows, 0)
            data_array = ds_var[start_row_act:end_row]
            if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                if time_format == "unix_millis":
                    data_array = [int(1000 * NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                else:
                    data_array = data_array.tolist()
            else:
                data_array = data_array.tolist()
            if var_name == self.time_var:
                time_slice = data_array

            res_data[var_name] = data_array

        # At this point res_data is dict mapping varname to data array. Time values are target (unix) timestamps
        self._expand_packed_rows(res_data, data_filter)

        self._decimate_rows(res_data, data_filter)

        if data_filter.get("transpose_time", False) is True:
            time_series = res_data.pop(self.time_var)
            for var_name, var_series in res_data.iteritems():
                res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

        return res_data
    finally:
        data_file.close()
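# Usage sketch (hypothetical, not from the source): a data_filter combining the keys that
# get_data() reads above. The variable names match the test schema below; the time bounds
# are placeholders and their expected unit depends on _get_row_interval (not shown here).
# decimate=True reduces rows to fit max_rows instead of truncating from the end;
# transpose_time=True returns each variable as (time, value) pairs.
def example_read(persistence):
    example_filter = dict(
        variables=["time", "var1"],   # subset of the dataset's variables
        start_time=None,              # placeholder - set a concrete bound as needed
        end_time=None,
        max_rows=500,
        decimate=True,
        transpose_time=True,
    )
    return persistence.get_data(example_filter)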
def require_dataset(self, ds_filename=None):
    """ Ensures a dataset HDF5 file exists and creates it if necessary using the dataset schema definition. """
    ds_filename = ds_filename or self._get_ds_filename()
    if os.path.exists(ds_filename):
        return ds_filename, False

    log.info("Creating new HDF5 file for dataset_id=%s, file='%s'", self.dataset_id, ds_filename)
    dir_path = os.path.split(ds_filename)[0]
    try:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    except OSError as exc:
        import errno
        if exc.errno == errno.EEXIST and os.path.isdir(dir_path):
            pass
        else:
            raise

    data_file = HDFLockingFile(ds_filename, "w", retry_count=10, retry_wait=0.5)
    try:
        data_file.attrs["dataset_id"] = self.dataset_id
        data_file.attrs["layout"] = self.ds_layout
        data_file.attrs["format"] = "scion_hdf5_v1"
        data_file.create_group("vars")
        initial_shape = (self.ds_increment, )

        if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
            # Individual layout means every variable has its own table - variable values
            # must be coindexed with the time values.
            # The time variable keeps the cur_row attribute, which is the next writable index.
            # The length of tables is increased in configurable chunk sizes.
            for position, var_info in enumerate(self.var_defs):
                var_name = var_info["name"]
                base_type = var_info.get("base_type", "float")
                dtype = var_info.get("storage_dtype", "f8")
                dset = data_file.create_dataset("vars/%s" % var_name, initial_shape, dtype=dtype, maxshape=(None, ))
                dset.attrs["base_type"] = str(base_type)
                dset.attrs["position"] = position
                dset.attrs["description"] = str(var_info.get("description", "") or "")
                dset.attrs["unit"] = str(var_info.get("unit", "") or "")
                if var_name == self.time_var:
                    dset.attrs["cur_row"] = 0

        elif self.ds_layout == DS_LAYOUT_COMBINED:
            # EXPERIMENTAL - unsupported for most operations.
            # Combined layout means all variables are in one table of structured type.
            # The cur_row attribute keeps the next writable index.
            # The length of the table is increased in configurable chunk sizes.
            dtype_parts = []
            for var_info in self.var_defs:
                var_name = var_info["name"]
                base_type = var_info.get("base_type", "float")
                dtype = var_info.get("storage_dtype", "f8")
                dtype_parts.append((var_name, dtype))
            dset = data_file.create_dataset("vars/%s" % DS_VARIABLES, initial_shape,
                                            dtype=np.dtype(dtype_parts), maxshape=(None, ))
            dset.attrs["dtype_repr"] = repr(dset.dtype)[6:-1]
            dset.attrs["cur_row"] = 0

        # Internal time index - a table indexing every nth row's timestep.
        # Index table grows in constant defined chunk size.
        data_file.create_group("index")
        dtype_tidx = [("time", "i8")]
        ds_tidx = data_file.create_dataset(DS_TIMEIDX_PATH, (INTERNAL_ROW_INCREMENT, ),
                                           dtype=dtype_tidx, maxshape=(None, ))
        ds_tidx.attrs["cur_row"] = 0
        ds_tidx.attrs["description"] = "Index of every %s-th time value" % self.time_idx_step
        ds_tidx.attrs["step"] = self.time_idx_step

        # Internal ingest time - table of 3-tuples with timestamp, start row and num rows for a packet.
        # Index table grows in constant defined chunk size.
        dtype_tingest = [("time", "i8"), ("row", "u4"), ("count", "u4")]
        ds_tingest = data_file.create_dataset(DS_TIMEINGEST_PATH, (INTERNAL_ROW_INCREMENT, ),
                                              dtype=dtype_tingest, maxshape=(None, ))
        ds_tingest.attrs["cur_row"] = 0
        ds_tingest.attrs["description"] = "Maintains ingest times"
    finally:
        data_file.close()

    return ds_filename, True
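# Resulting file layout (sketch derived from the code above; the exact index dataset paths
# come from the DS_TIMEIDX_PATH / DS_TIMEINGEST_PATH constants, which are not shown here):
#   /                  attrs: dataset_id, layout, format="scion_hdf5_v1"
#   /vars/<var_name>   one resizable table per variable (individual layout); the time
#                      variable carries the cur_row attribute (next writable row)
#   /index             internal group holding the time index (every time_idx_step-th
#                      timestamp) and the ingest-time table (time, row, count per packet)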
def extend_dataset(self, packet):
    """ Adds values from a data packet to the dataset and updates indexes and metadata """
    ingest_ts = NTP4Time.utcnow()
    num_rows, cur_idx, time_idx_rows = len(packet.data["data"]), 0, []
    ds_filename = self._get_ds_filename()
    data_file = HDFLockingFile(ds_filename, "r+", retry_count=10, retry_wait=0.5)
    file_closed = False
    try:
        if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
            # Get index values from time var
            if self.time_var not in packet.data["cols"]:
                raise BadRequest("Packet has no time")
            var_ds = data_file["vars/%s" % self.time_var]
            cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
            var_ds.attrs["cur_row"] += num_rows

            # Fill variables with values from packet or NaN
            for var_name in self.var_defs_map.keys():
                var_ds = data_file["vars/%s" % var_name]
                if cur_idx + num_rows > cur_size:
                    self._resize_dataset(var_ds, num_rows)
                if var_name in packet.data["cols"]:
                    data_slice = packet.data["data"][:][var_name]
                    var_ds[cur_idx:cur_idx + num_rows] = data_slice
                else:
                    # Leave the initial fill value (zeros)
                    #var_ds[cur_idx:cur_idx+num_rows] = [None]*num_rows
                    pass

            extra_vars = set(packet.data["cols"]) - set(self.var_defs_map.keys())
            if extra_vars:
                log.warn("Data packet had extra vars not in dataset: %s", extra_vars)

        elif self.ds_layout == DS_LAYOUT_COMBINED:
            var_ds = data_file["vars/%s" % DS_VARIABLES]
            cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
            if cur_idx + num_rows > cur_size:
                self._resize_dataset(var_ds, num_rows)
            ds_var_names = [var_info["name"] for var_info in self.var_defs]
            pvi = {col_name: col_idx for col_idx, col_name in enumerate(packet.data["cols"])
                   if col_name in ds_var_names}
            for row_idx in xrange(num_rows):
                row_data = packet.data["data"][row_idx]
                row_vals = tuple(row_data[vn] if vn in pvi else None for vn in ds_var_names)
                var_ds[cur_idx + row_idx] = row_vals
            var_ds.attrs["cur_row"] += num_rows

        # Update time_ingest (ts, begin row, count)
        ds_tingest = data_file[DS_TIMEINGEST_PATH]
        if ds_tingest.attrs["cur_row"] + 1 > len(ds_tingest):
            self._resize_dataset(ds_tingest, 1, INTERNAL_ROW_INCREMENT)
        ds_tingest[ds_tingest.attrs["cur_row"]] = (ingest_ts.to_np_value(), cur_idx, num_rows)
        ds_tingest.attrs["cur_row"] += 1

        # Update time index
        self._update_time_index(data_file, num_rows, cur_idx=cur_idx)

        # Check if pruning is necessary
        if self.prune_trigger_mode == "on_ingest" and self.prune_mode:
            file_closed = self._prune_dataset(data_file)

        #HDF5Tools.dump_hdf5(data_file, with_data=True)
    except Exception:
        log.exception("Error extending dataset %s HDF5 file" % self.dataset_id)
        raise
    finally:
        if not file_closed:
            data_file.close()
def require_dataset(self):
    """ Ensures a dataset HDF5 file exists and creates it if necessary using the dataset schema definition. """
    ds_filename = self._get_ds_filename()
    if os.path.exists(ds_filename):
        return ds_filename, False

    log.info("Creating new HDF5 file for dataset_id=%s, file='%s'", self.dataset_id, ds_filename)
    dir_path = os.path.split(ds_filename)[0]
    try:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    except OSError as exc:
        import errno
        if exc.errno == errno.EEXIST and os.path.isdir(dir_path):
            pass
        else:
            raise

    data_file = HDFLockingFile(ds_filename, "w", retry_count=10, retry_wait=0.5)
    try:
        data_file.attrs["dataset_id"] = self.dataset_id
        data_file.attrs["layout"] = self.ds_layout
        data_file.attrs["format"] = "scion_hdf5_v1"
        data_file.create_group("vars")
        initial_shape = (self.ds_increment, )

        if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
            for position, var_info in enumerate(self.var_defs):
                var_name = var_info["name"]
                base_type = var_info.get("base_type", "float")
                dtype = var_info.get("storage_dtype", "f8")
                dset = data_file.create_dataset("vars/%s" % var_name, initial_shape, dtype=dtype, maxshape=(None, ))
                dset.attrs["base_type"] = str(base_type)
                dset.attrs["position"] = position
                dset.attrs["description"] = str(var_info.get("description", "") or "")
                dset.attrs["unit"] = str(var_info.get("unit", "") or "")
                if var_name == self.time_var:
                    dset.attrs["cur_row"] = 0

        elif self.ds_layout == DS_LAYOUT_COMBINED:
            dtype_parts = []
            for var_info in self.var_defs:
                var_name = var_info["name"]
                base_type = var_info.get("base_type", "float")
                dtype = var_info.get("storage_dtype", "f8")
                dtype_parts.append((var_name, dtype))
            dset = data_file.create_dataset("vars/%s" % DS_VARIABLES, initial_shape,
                                            dtype=np.dtype(dtype_parts), maxshape=(None, ))
            dset.attrs["dtype_repr"] = repr(dset.dtype)[6:-1]
            dset.attrs["cur_row"] = 0

        # Internal time index
        data_file.create_group("index")
        dtype_tidx = [("time", "i8")]
        ds_tidx = data_file.create_dataset(DS_TIMEIDX_PATH, (INTERNAL_ROW_INCREMENT, ),
                                           dtype=dtype_tidx, maxshape=(None, ))
        ds_tidx.attrs["cur_row"] = 0
        ds_tidx.attrs["description"] = "Index of every %s-th time value" % self.time_idx_step
        ds_tidx.attrs["step"] = self.time_idx_step

        # Internal ingest time
        dtype_tingest = [("time", "i8"), ("row", "u4"), ("count", "u4")]
        ds_tingest = data_file.create_dataset(DS_TIMEINGEST_PATH, (INTERNAL_ROW_INCREMENT, ),
                                              dtype=dtype_tingest, maxshape=(None, ))
        ds_tingest.attrs["cur_row"] = 0
        ds_tingest.attrs["description"] = "Maintains ingest times"
    finally:
        data_file.close()

    return ds_filename, True
def test_hdf5_persist(self):
    # Test HDF5 writing, time indexing, array extension etc
    ds_schema_str = """
type: scion_data_schema_1
description: Schema for test datasets
attributes:
  basic_shape: 1d_timeseries
  time_variable: time
  persistence:
    format: hdf5
    layout: vars_individual
    row_increment: 1000
    time_index_step: 1000
variables:
  - name: time
    base_type: ntp_time
    storage_dtype: i8
    unit: ""
    description: NTPv4 timestamp
  - name: var1
    base_type: float
    storage_dtype: f8
    unit: ""
    description: Sample value
  - name: random1
    base_type: float
    storage_dtype: f8
    unit: ""
    description: Random values
"""
    ds_schema = yaml.load(ds_schema_str)
    ds_id = create_simple_unique_id()
    ds_filename = self.container.file_system.get("%s/%s%s.hdf5" % (DS_BASE_PATH, DS_FILE_PREFIX, ds_id))

    self.hdf5_persist = DatasetHDF5Persistence.get_persistence(ds_id, ds_schema, "hdf5")
    self.hdf5_persist.require_dataset()
    self.assertTrue(os.path.exists(ds_filename))
    self.addCleanup(os.remove, ds_filename)

    # Add 100 values in packets of 10
    for i in xrange(10):
        packet = self._get_data_packet(i * 10, 10)
        self.hdf5_persist.extend_dataset(packet)

    data_res = self.hdf5_persist.get_data()
    self.assertEqual(len(data_res), 3)
    self.assertEqual(len(data_res["time"]), 100)
    self.assertEqual(len(data_res["var1"]), 100)
    self.assertEqual(len(data_res["random1"]), 100)
    self.assertEqual(data_res["var1"][1], 1.0)

    with HDFLockingFile(ds_filename, "r") as hdff:
        ds_time = hdff["vars/time"]
        cur_idx = ds_time.attrs["cur_row"]
        self.assertEqual(cur_idx, 100)
        self.assertEqual(len(ds_time), 1000)

        ds_tidx = hdff[DS_TIMEIDX_PATH]
        cur_tidx = ds_tidx.attrs["cur_row"]
        self.assertEqual(cur_tidx, 1)
        self.assertEqual(len(ds_tidx), 1000)

    # Add 1000 values in packets of 10
    for i in xrange(100):
        packet = self._get_data_packet(100 + i * 10, 10)
        self.hdf5_persist.extend_dataset(packet)

    data_res = self.hdf5_persist.get_data()
    self.assertEqual(len(data_res["time"]), 1100)

    with HDFLockingFile(ds_filename, "r") as hdff:
        ds_time = hdff["vars/time"]
        cur_idx = ds_time.attrs["cur_row"]
        self.assertEqual(cur_idx, 1100)
        self.assertEqual(len(ds_time), 2000)

        ds_tidx = hdff[DS_TIMEIDX_PATH]
        cur_tidx = ds_tidx.attrs["cur_row"]
        self.assertEqual(cur_tidx, 2)
        self.assertEqual(len(ds_tidx), 1000)
        self.assertEqual(ds_time[0], ds_tidx[0][0])
        self.assertEqual(ds_time[1000], ds_tidx[1][0])

    info_res = self.hdf5_persist.get_data_info()
    self.assertEqual(info_res["ds_rows"], 1100)
    self.assertEqual(info_res["ts_first"], 1000000000.0)
    self.assertEqual(info_res["ts_last"], 1000010990.0)
def _get_data_copy(self, data_file, data_filter=None):
    """ Helper to copy HDF5 that takes an already open file handle """
    data_filter = data_filter or {}
    res_data = {}
    read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
    start_time = data_filter.get("start_time", None)
    end_time = data_filter.get("end_time", None)
    start_time_include = data_filter.get("start_time_include", True) is True
    time_slice = None

    ds_time = data_file["vars/%s" % self.time_var]
    cur_idx = ds_time.attrs["cur_row"]

    start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
    num_rows = end_row - start_row
    log.info("Copying dataset: %s rows of %s (%s to %s)", end_row - start_row, cur_idx, start_row, end_row)

    if self.ds_layout != DS_LAYOUT_INDIVIDUAL:
        raise NotImplementedError()

    copy_filename = self.container.file_system.get("TEMP/ds_temp_%s.hdf5" % uuid.uuid4().hex)
    try:
        self.require_dataset(ds_filename=copy_filename)
        new_file = HDFLockingFile(copy_filename, "r+", retry_count=2, retry_wait=0.1)
        try:
            for var_name in read_vars:
                ds_path = "vars/%s" % var_name
                if ds_path not in data_file:
                    log.warn("Variable '%s' not in dataset - ignored", var_name)
                    continue
                ds_var = data_file[ds_path]
                new_ds_var = new_file[ds_path]
                if num_rows > len(new_ds_var):
                    self._resize_dataset(new_ds_var, num_rows)
                data_array = ds_var[start_row:end_row]
                # TODO: Chunkwise copy instead of one big
                new_ds_var[0:num_rows] = data_array
                if var_name == self.time_var:
                    new_ds_var.attrs["cur_row"] = num_rows
                # Use lower level copy

            # Time index
            self._update_time_index(new_file, num_rows, cur_idx=0)

            # Ingest ts - copy from existing, fix index values and prune
            ds_tingest = data_file[DS_TIMEINGEST_PATH]
            new_ds_tingest = new_file[DS_TIMEINGEST_PATH]
            self._resize_dataset(new_ds_tingest, len(ds_tingest), INTERNAL_ROW_INCREMENT)   # Could be smaller
            prev_irow_val, cur_new_row = None, 0
            for irow, irow_val in enumerate(ds_tingest[:ds_tingest.attrs["cur_row"]]):
                its, iidx, inrows = irow_val
                if iidx >= start_row:
                    if iidx > start_row and cur_new_row == 0 and prev_irow_val:
                        # The first is partial of previous row
                        pits, piidx, pinrows = prev_irow_val
                        new_ds_tingest[cur_new_row] = (pits, 0, iidx - start_row)
                        cur_new_row += 1
                    if iidx + inrows - 1 < end_row:
                        new_ds_tingest[cur_new_row] = (its, iidx - start_row, min(inrows, end_row - iidx))
                        cur_new_row += 1
                prev_irow_val = irow_val
            new_ds_tingest.attrs["cur_row"] = cur_new_row
        finally:
            try:
                new_file.close()
            except Exception:
                pass

        log.info("Copy dataset successful: %s rows, %s bytes (original size: %s bytes)",
                 num_rows, os.path.getsize(copy_filename), os.path.getsize(self._get_ds_filename()))
        #HDF5Tools.dump_hdf5(copy_filename, with_data=True)
    except Exception:
        log.exception("Error copying HDF5 file")
        if os.path.exists(copy_filename):
            os.remove(copy_filename)
        return None

    return copy_filename
def get_data(self, data_filter=None):
    data_filter = data_filter or {}
    ds_filename = self._get_ds_filename()
    if not os.path.exists(ds_filename):
        return {}
    data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
    try:
        res_data = {}
        read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
        time_format = data_filter.get("time_format", "unix_millis")
        max_rows = data_filter.get("max_rows", 999999999)
        time_slice = None

        if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
            time_ds = data_file["vars/%s" % self.time_var]
            cur_idx = time_ds.attrs["cur_row"]
            for var_name in read_vars:
                ds_path = "vars/%s" % var_name
                if ds_path not in data_file:
                    log.warn("Variable '%s' not in dataset - ignored", var_name)
                    continue
                var_ds = data_file[ds_path]
                data_array = var_ds[max(cur_idx-max_rows, 0):cur_idx]
                if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                    if time_format == "unix_millis":
                        data_array = [int(1000*NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                    else:
                        data_array = data_array.tolist()
                else:
                    data_array = data_array.tolist()
                if var_name == self.time_var:
                    time_slice = data_array

                res_data[var_name] = data_array

            if data_filter.get("transpose_time", False) is True:
                time_series = res_data.pop(self.time_var)
                for var_name, var_series in res_data.iteritems():
                    res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

            # Downsample: http://stackoverflow.com/questions/20322079/downsample-a-1d-numpy-array

        elif self.ds_layout == DS_LAYOUT_COMBINED:
            raise NotImplementedError()

        start_time = data_filter.get("start_time", None)
        start_time_include = data_filter.get("start_time_include", True) is True
        if time_slice and res_data and start_time:
            start_time = int(start_time)
            time_idx = len(time_slice)
            for idx, tv in enumerate(time_slice):
                if tv == start_time and start_time_include:
                    time_idx = idx
                    break
                elif tv > start_time:
                    time_idx = idx
                    break
            for var_name, var_series in res_data.iteritems():
                res_data[var_name] = var_series[time_idx:]

        return res_data
    finally:
        data_file.close()
def get_data(self, data_filter=None):
    data_filter = data_filter or {}
    ds_filename = self._get_ds_filename()
    if not os.path.exists(ds_filename):
        return {}
    data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
    try:
        res_data = {}
        read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
        time_format = data_filter.get("time_format", "unix_millis")
        max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
        start_time = data_filter.get("start_time", None)
        end_time = data_filter.get("end_time", None)
        start_time_include = data_filter.get("start_time_include", True) is True
        should_decimate = data_filter.get("decimate", False) is True
        time_slice = None

        start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
        log.info("Row date interval: %s : %s", start_row, end_row)

        if self.expand_info.get("need_expand", False):
            max_rows = max_rows / self.expand_info["num_steps"]   # Compensate expansion
        if end_row-start_row > max_rows:
            log.info("Truncating %s rows to %s max rows (from the end)", end_row-start_row, max_rows)

        if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
            ds_time = data_file["vars/%s" % self.time_var]
            cur_idx = ds_time.attrs["cur_row"]
            for var_name in read_vars:
                ds_path = "vars/%s" % var_name
                if ds_path not in data_file:
                    log.warn("Variable '%s' not in dataset - ignored", var_name)
                    continue
                ds_var = data_file[ds_path]
                data_array = ds_var[max(start_row, end_row-max_rows, 0):end_row]
                if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                    if time_format == "unix_millis":
                        data_array = [int(1000*NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                    else:
                        data_array = data_array.tolist()
                else:
                    data_array = data_array.tolist()
                if var_name == self.time_var:
                    time_slice = data_array

                res_data[var_name] = data_array

            # At this point we have a dict mapping variable names to data arrays with target (unix) timestamps
            self._expand_packed_rows(res_data, data_filter)

            if data_filter.get("transpose_time", False) is True:
                time_series = res_data.pop(self.time_var)
                for var_name, var_series in res_data.iteritems():
                    res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

            # Downsample: http://stackoverflow.com/questions/20322079/downsample-a-1d-numpy-array

        elif self.ds_layout == DS_LAYOUT_COMBINED:
            raise NotImplementedError()

        return res_data
    finally:
        data_file.close()