def test_hdf5_persist_decimate(self):
    # Test decimated reads from a larger HDF5 dataset
    ds_schema_str = """
type: scion_data_schema_1
description: Schema for test datasets
attributes:
  basic_shape: 1d_timeseries
  time_variable: time
persistence:
  format: hdf5
  layout: vars_individual
  row_increment: 1000
  time_index_step: 1000
variables:
  - name: time
    base_type: ntp_time
    storage_dtype: i8
    unit: ""
    description: NTPv4 timestamp
  - name: var1
    base_type: float
    storage_dtype: f8
    unit: ""
    description: Sample value
  - name: random1
    base_type: float
    storage_dtype: f8
    unit: ""
    description: Random values
"""
    ds_schema = yaml.load(ds_schema_str)
    ds_id = create_simple_unique_id()
    ds_filename = self.container.file_system.get("%s/%s%s.hdf5" % (DS_BASE_PATH, DS_FILE_PREFIX, ds_id))
    self.hdf5_persist = DatasetHDF5Persistence.get_persistence(ds_id, ds_schema, "hdf5")
    self.hdf5_persist.require_dataset()
    self.assertTrue(os.path.exists(ds_filename))
    self.addCleanup(os.remove, ds_filename)

    # Add 10000 values in packets of 100
    for i in xrange(100):
        packet = self._get_data_packet(i * 100, 100)
        self.hdf5_persist.extend_dataset(packet)

    # An unfiltered read returns all rows for all 3 variables
    data_res = self.hdf5_persist.get_data()
    self.assertEqual(len(data_res), 3)
    self.assertEqual(len(data_res["time"]), 10000)

    # A decimated read stays within the max_rows bound
    data_res = self.hdf5_persist.get_data(dict(max_rows=999, decimate=True, decimate_method="minmax"))
    self.assertEqual(len(data_res), 3)
    self.assertLessEqual(len(data_res["time"]), 1000)
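
# For reference: a minimal sketch of the "minmax" decimation idea the test
# above exercises -- an illustration only, not the DatasetHDF5Persistence
# implementation. Rows are split into buckets and each bucket contributes
# its min and max, so at most max_rows values survive while spikes do too.
import numpy as np

def minmax_decimate(values, max_rows):
    values = np.asarray(values)
    num_buckets = max(max_rows // 2, 1)        # each bucket yields up to 2 points
    out = []
    for bucket in np.array_split(values, num_buckets):
        if not len(bucket):
            continue
        lo, hi = int(bucket.argmin()), int(bucket.argmax())
        out.extend(bucket[i] for i in sorted({lo, hi}))   # keep original time order
    return np.array(out)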
def get_asset_data(self, asset_id='', data_format='', data_filter=None):
    asset_obj = self._validate_resource_id("asset_id", asset_id, RT.Instrument)
    dataset_objs, _ = self.rr.find_objects(asset_id, PRED.hasDataset, RT.Dataset, id_only=False)
    if not dataset_objs:
        raise BadRequest("Could not find dataset")
    dataset_obj = dataset_objs[0]
    from ion.data.persist.hdf5_dataset import DatasetHDF5Persistence
    persistence = DatasetHDF5Persistence(dataset_obj._id, dataset_obj.schema_definition, "hdf5")

    # Defaults: transposed time axis in Unix millis, capped at 1000 rows.
    # Caller-provided filter entries override these defaults.
    data_filter1 = dict(transpose_time=True, time_format="unix_millis", max_rows=1000)
    data_filter1.update(data_filter or {})

    data_info = dict(dataset_id=dataset_obj._id, ts_generated=get_ion_ts(),
                     data={}, info={}, num_rows=0)

    if data_filter1.get("get_info", None) is True:
        data_info["variables"] = [var_info["name"] for var_info in dataset_obj.schema_definition["variables"]]
        data_info["schema"] = dataset_obj.schema_definition
        res_info = persistence.get_data_info(data_filter1)
        data_info["info"].update(res_info)

    if data_filter1.get("include_data", True):
        raw_data = persistence.get_data(data_filter=data_filter1)
        data_info["data"] = raw_data
        data_info["num_rows"] = len(raw_data.values()[0]) if raw_data else 0

    return data_info
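
# Example call (hypothetical client code -- asset_mgmt and the asset id are
# placeholders): fetch schema info plus at most 500 decimated recent rows.
# The filter keys mirror those consumed in get_asset_data above.
#
#   data_info = asset_mgmt.get_asset_data("instrument_id_1", data_filter=dict(
#       get_info=True, max_rows=500, decimate=True, decimate_method="minmax"))
#   times = data_info["data"]["time"]    # Unix millis (transpose_time default)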
def download_asset_data(self, asset_id='', data_format='', data_filter=None):
    asset_obj = self._validate_resource_id("asset_id", asset_id, RT.Instrument)
    dataset_objs, _ = self.rr.find_objects(asset_id, PRED.hasDataset, RT.Dataset, id_only=False)
    if not dataset_objs:
        raise BadRequest("Could not find dataset")
    dataset_obj = dataset_objs[0]
    if data_format and data_format != "hdf5":
        raise BadRequest("Unsupported download data format")
    from ion.data.persist.hdf5_dataset import DatasetHDF5Persistence
    persistence = DatasetHDF5Persistence(dataset_obj._id, dataset_obj.schema_definition, "hdf5")

    # Defaults: at most 100000 rows from the last 24 hours (86400000 ms).
    # Caller-provided filter entries override these defaults.
    data_filter1 = dict(transpose_time=True, time_format="unix_millis", max_rows=100000,
                        start_time=get_ion_ts_millis() - 86400000)
    data_filter1.update(data_filter or {})

    # Write a filtered copy of the dataset to a temp file and return it as
    # an HDF5 attachment for the HTTP layer to stream back
    temp_filename = persistence.get_data_copy(data_filter=data_filter1)
    resp_hdrs = {"Content-Disposition": 'attachment; filename="ds_%s.hdf5"' % asset_obj._id}
    mr = MediaResponse(media_mimetype="application/octet-stream", body=temp_filename,
                       internal_encoding="filename", response_headers=resp_hdrs)
    return mr
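
# Example call (hypothetical client code): download the last hour instead of
# the 24-hour default; start_time here overrides the default set above.
#
#   mr = asset_mgmt.download_asset_data("instrument_id_1", data_format="hdf5",
#       data_filter=dict(start_time=get_ion_ts_millis() - 3600000))
#   # mr.body holds the temp filename (internal_encoding="filename")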
def test_hdf5_persist(self):
    # Test HDF5 writing, time indexing, array extension etc
    ds_schema_str = """
type: scion_data_schema_1
description: Schema for test datasets
attributes:
  basic_shape: 1d_timeseries
  time_variable: time
persistence:
  format: hdf5
  layout: vars_individual
  row_increment: 1000
  time_index_step: 1000
variables:
  - name: time
    base_type: ntp_time
    storage_dtype: i8
    unit: ""
    description: NTPv4 timestamp
  - name: var1
    base_type: float
    storage_dtype: f8
    unit: ""
    description: Sample value
  - name: random1
    base_type: float
    storage_dtype: f8
    unit: ""
    description: Random values
"""
    ds_schema = yaml.load(ds_schema_str)
    ds_id = create_simple_unique_id()
    ds_filename = self.container.file_system.get("%s/%s%s.hdf5" % (DS_BASE_PATH, DS_FILE_PREFIX, ds_id))
    self.hdf5_persist = DatasetHDF5Persistence.get_persistence(ds_id, ds_schema, "hdf5")
    self.hdf5_persist.require_dataset()
    self.assertTrue(os.path.exists(ds_filename))
    self.addCleanup(os.remove, ds_filename)

    # Add 100 values in packets of 10
    for i in xrange(10):
        packet = self._get_data_packet(i * 10, 10)
        self.hdf5_persist.extend_dataset(packet)

    data_res = self.hdf5_persist.get_data()
    self.assertEqual(len(data_res), 3)
    self.assertEqual(len(data_res["time"]), 100)
    self.assertEqual(len(data_res["var1"]), 100)
    self.assertEqual(len(data_res["random1"]), 100)
    self.assertEqual(data_res["var1"][1], 1.0)

    with HDFLockingFile(ds_filename, "r") as hdff:
        ds_time = hdff["vars/time"]
        cur_idx = ds_time.attrs["cur_row"]
        self.assertEqual(cur_idx, 100)
        self.assertEqual(len(ds_time), 1000)

        ds_tidx = hdff[DS_TIMEIDX_PATH]
        cur_tidx = ds_tidx.attrs["cur_row"]
        self.assertEqual(cur_tidx, 1)
        self.assertEqual(len(ds_tidx), 1000)

    # Add 1000 values in packets of 10
    for i in xrange(100):
        packet = self._get_data_packet(100 + i * 10, 10)
        self.hdf5_persist.extend_dataset(packet)

    data_res = self.hdf5_persist.get_data()
    self.assertEqual(len(data_res["time"]), 1100)

    with HDFLockingFile(ds_filename, "r") as hdff:
        ds_time = hdff["vars/time"]
        cur_idx = ds_time.attrs["cur_row"]
        self.assertEqual(cur_idx, 1100)
        self.assertEqual(len(ds_time), 2000)

        ds_tidx = hdff[DS_TIMEIDX_PATH]
        cur_tidx = ds_tidx.attrs["cur_row"]
        self.assertEqual(cur_tidx, 2)
        self.assertEqual(len(ds_tidx), 1000)

        self.assertEqual(ds_time[0], ds_tidx[0][0])
        self.assertEqual(ds_time[1000], ds_tidx[1][0])

    info_res = self.hdf5_persist.get_data_info()
    self.assertEqual(info_res["ds_rows"], 1100)
    self.assertEqual(info_res["ts_first"], 1000000000.0)
    self.assertEqual(info_res["ts_last"], 1000010990.0)
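
# For reference: the coarse time index asserted on above (DS_TIMEIDX_PATH)
# stores one entry per time_index_step rows, and the test shows that entry
# i's first element is the timestamp of row i * time_index_step. A minimal
# sketch of how such an index can serve a start_time query -- an
# illustration only, assuming (timestamp, ...) entries, not the library code:
import bisect

def find_start_row(time_index, num_entries, start_time, time_index_step):
    # Binary-search the index to locate the chunk containing start_time;
    # scanning vars/time forward from the returned row finds the exact match
    ts_list = [time_index[i][0] for i in xrange(num_entries)]
    chunk = max(bisect.bisect_right(ts_list, start_time) - 1, 0)
    return chunk * time_index_step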
def test_hdf5_persist_prune(self):
    # Test auto-pruning
    ds_schema_str = """
type: scion_data_schema_1
description: Schema for test datasets
attributes:
  basic_shape: 1d_timeseries
  time_variable: time
persistence:
  format: hdf5
  layout: vars_individual
  row_increment: 1000
  time_index_step: 1000
pruning:
  trigger_mode: on_ingest
  prune_mode: max_age_rel
  prune_action: rewrite
  trigger_age: 1000.0
  retain_age: 500.0
variables:
  - name: time
    base_type: ntp_time
    storage_dtype: i8
    unit: ""
    description: NTPv4 timestamp
  - name: var1
    base_type: float
    storage_dtype: f8
    unit: ""
    description: Sample value
  - name: random1
    base_type: float
    storage_dtype: f8
    unit: ""
    description: Random values
"""
    ds_schema = yaml.load(ds_schema_str)
    ds_id = create_simple_unique_id()
    ds_filename = self.container.file_system.get("%s/%s%s.hdf5" % (DS_BASE_PATH, DS_FILE_PREFIX, ds_id))
    self.hdf5_persist = DatasetHDF5Persistence.get_persistence(ds_id, ds_schema, "hdf5")
    self.hdf5_persist.require_dataset()
    self.assertTrue(os.path.exists(ds_filename))
    self.addCleanup(os.remove, ds_filename)

    # Add 100 values in packets of 10 (right up to the prune trigger)
    for i in xrange(10):
        packet = self._get_data_packet(i * 10, 10)
        self.hdf5_persist.extend_dataset(packet)

    data_res = self.hdf5_persist.get_data()
    self.assertEqual(len(data_res["time"]), 100)
    self.assertEqual(len(data_res["var1"]), 100)
    self.assertEqual(len(data_res["random1"]), 100)
    self.assertEqual(data_res["var1"][1], 1.0)

    log.info("*** STEP 2: First prune")

    # Add 2 values (stepping across the prune trigger - inclusive boundary)
    packet = self._get_data_packet(100, 2)
    self.hdf5_persist.extend_dataset(packet)

    data_res = self.hdf5_persist.get_data()
    self.assertEqual(len(data_res["time"]), 51)
    self.assertEqual(len(data_res["var1"]), 51)
    self.assertEqual(len(data_res["random1"]), 51)
    self.assertEqual(data_res["var1"][0], 51.0)
    self.assertEqual(data_res["var1"][50], 101.0)

    log.info("*** STEP 3: Additional data")

    # Add 50 more values in mixed-size packets (right up to the next prune trigger)
    packet = self._get_data_packet(102, 8)
    self.hdf5_persist.extend_dataset(packet)
    for i in xrange(4):
        packet = self._get_data_packet(110 + i * 10, 10)
        self.hdf5_persist.extend_dataset(packet)
    packet = self._get_data_packet(150, 2)
    self.hdf5_persist.extend_dataset(packet)

    data_res = self.hdf5_persist.get_data()
    self.assertEqual(len(data_res["time"]), 101)
    self.assertEqual(data_res["var1"][0], 51.0)
    self.assertEqual(data_res["var1"][100], 151.0)

    log.info("*** STEP 4: Second prune")

    packet = self._get_data_packet(152, 1)
    self.hdf5_persist.extend_dataset(packet)

    data_res = self.hdf5_persist.get_data()
    self.assertEqual(len(data_res["time"]), 51)
    self.assertEqual(data_res["var1"][0], 102.0)
    self.assertEqual(data_res["var1"][50], 152.0)

    log.info("*** STEP 5: Third prune")

    packet = self._get_data_packet(153, 100)
    self.hdf5_persist.extend_dataset(packet)

    data_res = self.hdf5_persist.get_data()
    self.assertEqual(len(data_res["time"]), 51)
    self.assertEqual(data_res["var1"][0], 202.0)
    self.assertEqual(data_res["var1"][50], 252.0)
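
# For reference: the prune arithmetic the test above relies on
# (prune_mode=max_age_rel, prune_action=rewrite, samples spaced 10s apart).
# On ingest, once newest - oldest exceeds trigger_age, the dataset is
# rewritten keeping only rows with time >= newest - retain_age. A minimal
# sketch of that row selection -- an illustration, not the library code:

def prune_keep(times, trigger_age, retain_age):
    if not times or times[-1] - times[0] <= trigger_age:
        return times                          # trigger age not yet exceeded
    cutoff = times[-1] - retain_age           # retain boundary is inclusive
    return [t for t in times if t >= cutoff]

# With 10s spacing and trigger_age=1000.0 / retain_age=500.0, the first
# prune fires once row 101 lands (age 1010 > 1000) and retains rows
# 51..101 -- the 51 rows with var1[0] == 51.0 that STEP 2 asserts.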
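
# NOTE: the tests above use a _get_data_packet(start_idx, num_rows) helper
# defined elsewhere in the test class. Its behavior can be inferred from the
# assertions: samples are spaced 10 seconds apart starting at NTP time
# 1000000000.0, var1[i] == float(i), and random1 holds random values.
# The dict-of-columns shape sketched below is an assumption for illustration,
# not the actual packet type consumed by extend_dataset:
#
#   def _get_data_packet(self, start_idx, num_rows=1):
#       return dict(data=dict(
#           time=[1000000000.0 + 10 * (start_idx + i) for i in xrange(num_rows)],
#           var1=[float(start_idx + i) for i in xrange(num_rows)],
#           random1=[random.random() for _ in xrange(num_rows)]))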