def test_time_string_fidelity(self):
    it1 = NTP4Time()
    ntp_str = it1.to_string()
    it2 = NTP4Time.from_string(ntp_str)
    self.assertEquals(it1.seconds, it2.seconds)
    self.assertTrue(np.abs(it1.useconds - it2.useconds) <= 1)
def _prune_dataset(self, data_file):
    if not self.prune_mode:
        return
    if self.prune_mode == "max_age_rel":
        # Prunes if first timestamp older than trigger compared to most recent timestamp
        trigger_age = float(self.pruning_attrs.get("trigger_age", 0))
        retain_age = float(self.pruning_attrs.get("retain_age", 0))
        if trigger_age <= 0.0 or retain_age <= 0.0 or trigger_age < retain_age:
            raise BadRequest("Bad pruning trigger_age or retain_age")
        var_ds = data_file["vars/%s" % self.time_var]
        cur_idx = var_ds.attrs["cur_row"]
        if not len(var_ds) or not cur_idx:
            return
        min_ts = NTP4Time.from_ntp64(var_ds[0].tostring()).to_unix()
        max_ts = NTP4Time.from_ntp64(var_ds[cur_idx - 1].tostring()).to_unix()
        if min_ts + trigger_age >= max_ts:
            return

        # Find the first index that is lower or equal to retain_age and delete gap
        start_time = (max_ts - retain_age) * 1000
        log.info("PRUNING dataset now: mode=%s, start_time=%s", self.prune_mode, int(start_time))
        copy_filename = self._get_data_copy(data_file, data_filter=dict(start_time=start_time))
    elif self.prune_mode == "max_age_abs":
        # Prunes if first timestamp older than trigger compared to current timestamp
        raise NotImplementedError()
    elif self.prune_mode == "max_rows":
        raise NotImplementedError()
    elif self.prune_mode == "max_size":
        raise NotImplementedError()
    else:
        raise BadRequest("Invalid prune_mode: %s" % self.prune_mode)

    if not copy_filename:
        return

    # Do the replace of data file with the copy.
    # Make sure to heed race conditions so that waiting processes won't lock the file first
    ds_filename = self._get_ds_filename()
    ds_filename_bak = ds_filename + ".bak"
    if os.path.exists(ds_filename_bak):
        os.remove(ds_filename_bak)

    data_file.close()  # Note: Inter-process race condition possible because close removes the lock
    shutil.move(ds_filename, ds_filename_bak)
    shutil.move(copy_filename, ds_filename)  # Note: There may be a cross-device-link error here
    # os.remove(ds_filename_bak)
    log.info("Pruning successful. Replaced dataset with pruned file.")
    return True
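# A minimal sketch of the configuration that drives the "max_age_rel" pruning above.
# The key names (prune_trigger_mode, prune_mode, pruning_attrs, trigger_age, retain_age)
# are taken from the code; the dict wrapper and the concrete values are assumptions.
example_retention_config = {
    "prune_trigger_mode": "on_ingest",   # extend_dataset checks this before pruning
    "prune_mode": "max_age_rel",         # the only mode implemented above
    "pruning_attrs": {
        "trigger_age": 3600.0,           # prune once newest minus oldest timestamp exceeds 1 hour
        "retain_age": 1800.0,            # keep roughly the most recent 30 minutes (must be <= trigger_age)
    },
}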
def gte_time(data_val, cmp_val, allow_equal=True):
    # Support NTP4 timestamp and Unix millis (i8)
    if time_type == "ntp_time":
        if allow_equal:
            return NTP4Time.from_ntp64(data_val.tostring()).to_unix() >= cmp_val
        else:
            return NTP4Time.from_ntp64(data_val.tostring()).to_unix() > cmp_val
    else:
        if allow_equal:
            return data_val >= cmp_val
        else:
            return data_val > cmp_val
def get_data_info(self, data_filter=None):
    data_filter = data_filter or {}
    ds_filename = self._get_ds_filename()
    if not os.path.exists(ds_filename):
        return {}
    data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
    try:
        res_info = {}
        max_rows_org = max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
        start_time = data_filter.get("start_time", None)
        end_time = data_filter.get("end_time", None)
        start_time_include = data_filter.get("start_time_include", True) is True
        should_decimate = data_filter.get("decimate", False) is True

        ds_time = data_file["vars/%s" % self.time_var]
        cur_idx = ds_time.attrs["cur_row"]

        res_info["ds_rows"] = cur_idx
        res_info["ds_size"] = len(ds_time)
        res_info["file_size"] = os.path.getsize(ds_filename)
        res_info["file_name"] = ds_filename
        res_info["vars"] = list(data_file["vars"])

        start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)

        res_info["need_expand"] = self.expand_info.get("need_expand", False)
        if self.expand_info.get("need_expand", False):
            max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
        res_info["should_decimate"] = should_decimate
        res_info["need_decimate"] = bool(should_decimate and end_row - start_row > max_rows)

        res_info["ts_first"] = NTP4Time.from_ntp64(ds_time.value[0].tostring()).to_unix()
        res_info["ts_last"] = NTP4Time.from_ntp64(ds_time.value[cur_idx - 1].tostring()).to_unix()
        res_info["ts_first_str"] = get_datetime_str(res_info["ts_first"] * 1000, local_time=False)
        res_info["ts_last_str"] = get_datetime_str(res_info["ts_last"] * 1000, local_time=False)
        res_info["ds_samples"] = cur_idx * self.expand_info["num_steps"] if res_info["need_expand"] else cur_idx

        res_info["filter_start_row"] = start_row
        res_info["filter_end_row"] = end_row
        res_info["filter_max_rows"] = max_rows
        res_info["filter_ts_first"] = NTP4Time.from_ntp64(ds_time.value[start_row].tostring()).to_unix()
        res_info["filter_ts_last"] = NTP4Time.from_ntp64(ds_time.value[end_row - 1].tostring()).to_unix()
        res_info["filter_ts_first_str"] = get_datetime_str(res_info["filter_ts_first"] * 1000, local_time=False)
        res_info["filter_ts_last_str"] = get_datetime_str(res_info["filter_ts_last"] * 1000, local_time=False)

        return res_info
    finally:
        data_file.close()
def acquire_samples(self, max_samples=0):
    sample = [NTP4Time.utcnow().to_ntp64(), psutil.cpu_percent()]
    sample_desc = dict(cols=["time", "cpu_percent"], data=[sample])
    return sample_desc
def build_packet_from_samples(cls, samples, **kwargs):
    num_samples = len(samples["data"])
    dtype_parts = []
    for coldef in samples["cols"]:
        if coldef == "time":
            dtype_parts.append((coldef, "i8"))
        elif "coltypes" in samples and coldef in samples["coltypes"]:
            dtype_parts.append((coldef, samples["coltypes"][coldef]))
        else:
            dtype_parts.append((coldef, "f8"))
    dt = np.dtype(dtype_parts)

    data_array = np.zeros(num_samples, dtype=dt)
    for row_num, data_row in enumerate(samples["data"]):
        row_tuple = tuple(NTP4Time.np_from_string(dv) if isinstance(dv, basestring) else dv for dv in data_row)
        data_array[row_num] = np.array(row_tuple, dtype=dt)
    data = samples.copy()
    data["data"] = data_array

    new_packet = DataPacket(ts_created=get_ion_ts(), data=data)
    for attr in new_packet.__dict__.keys():
        if attr in ('data', 'ts_created'):
            continue
        if attr in kwargs:
            setattr(new_packet, attr, kwargs[attr])
    return new_packet
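# Hedged usage sketch for build_packet_from_samples: the samples dict mirrors the
# sample_desc returned by the acquire_samples plugins in this collection
# (cols/coltypes/data), and the resource_id/stream_name keywords follow the test
# helper further below. Concrete values are illustrative assumptions.
samples = dict(cols=["time", "sample_vector"],
               coltypes=dict(sample_vector="10i2"),   # 10-element int16 vector per row
               data=[[NTP4Time.utcnow().to_ntp64(), tuple(range(10))]])
packet = DataPacketBuilder.build_packet_from_samples(samples,
                                                     resource_id="ds_id",
                                                     stream_name="basic_streams")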
def test_ntp_compatability(self):
    unix_day = NTP4Time(datetime.datetime(1970, 1, 1))
    self.assertEquals(unix_day.era, 0)
    self.assertEquals(unix_day.seconds, 2208988800)

    utc_day = NTP4Time(datetime.datetime(1972, 1, 1))
    self.assertEquals(utc_day.era, 0)
    self.assertEquals(utc_day.seconds, 2272060800)

    millen_day = NTP4Time(datetime.datetime(2000, 1, 1))
    self.assertEquals(millen_day.era, 0)
    self.assertEquals(millen_day.seconds, 3155673600)

    ntp_era1 = NTP4Time(datetime.datetime(2036, 2, 8))
    self.assertEquals(ntp_era1.era, 1)
    self.assertEquals(ntp_era1.seconds, 63104)
    self.assertEquals(ntp_era1.to_unix(), 2086041600.)
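# Worked check of the era arithmetic asserted above, using plain datetime math
# rather than NTP4Time internals: NTP counts seconds from 1900-01-01 and rolls
# over into era 1 after 2**32 seconds, so 2036-02-08 00:00 UTC falls 63104
# seconds into era 1 while still mapping to Unix time 2086041600.
import calendar
import datetime

ntp_epoch = datetime.datetime(1900, 1, 1)
total_seconds = int((datetime.datetime(2036, 2, 8) - ntp_epoch).total_seconds())
era, seconds = divmod(total_seconds, 2 ** 32)
assert (era, seconds) == (1, 63104)
assert calendar.timegm(datetime.datetime(2036, 2, 8).timetuple()) == 2086041600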
def _extract_row(self, pkt, cols):
    row = []
    for c in cols:
        for ch in pkt['channels']:
            if ch['chan'] == c:
                row.append(tuple(ch['data']))
                break
    orbtime = pkt['channels'][0]['time']
    row.append(NTP4Time(orbtime).to_ntp64())
    return row
def acquire_samples(self, max_samples=0):
    ts = time.time()
    sample = [NTP4Time.utcnow().to_ntp64(),
              20 * math.sin(10 * ts) + 5,
              10 * math.sin(15 * ts) + 10,
              random.random() * 100]
    sample_desc = dict(cols=["time", "wave1", "wave2", "random1"], data=[sample])
    return sample_desc
def acquire_samples(self, max_samples=0):
    log.debug('CDIP_DataAgentPlugin.acquire_samples')

    # Read server, extract last sample.
    data = requests.get(self.streaming_args.url)
    m = None
    for m in re.finditer(pattern, data.text, flags=re.MULTILINE):
        pass
    if not m:
        log.warning('CDIP_DataAgentPlugin.acquire_samples: No data found.')
        return None

    year = int(m.group(1))
    month = int(m.group(2))
    day = int(m.group(3))
    hour = int(m.group(4))
    minute = int(m.group(5))
    Hs = float(m.group(6))
    Tp = float(m.group(7))
    Dp = int(m.group(8))
    Ta = float(m.group(9))
    Temp = float(m.group(10))

    # Create sample.
    # [ntp64_ts, Hs, Tp, Dp, Ta, Temp]
    # ['\xdb\x07\x00,\x00\x00\x00\x00', 2.66, 9.09, 328, 6.67, 12.2]
    dt = datetime.datetime(year, month, day, hour, minute)
    ts = NTP4Time(dt).to_ntp64()
    sample = [ts, Hs, Tp, Dp, Ta, Temp]

    # Compare to last reading.
    if self.last_sample == sample:
        log.debug('CDIP_DataAgentPlugin.acquire_samples: No new data.')
        return None

    # Update, pack and return.
    log.debug('CDIP_DataAgentPlugin.acquire_samples: Got new data.')
    log.debug('CDIP data: %s' % str(sample))
    self.last_sample = sample
    sample_desc = dict(cols=["time", "Hs", "Tp", "Dp", "Ta", "Temp"],
                       data=[sample])
    return sample_desc
def acquire_samples(self, max_samples=0):
    if len(self.samples) <= self.sample_index:
        log.warn("Out of samples at index %s", self.sample_index)
        self.sample_index += 1
        return None
    data_row = self.samples[self.sample_index]
    self.sample_index += 1
    sample = [NTP4Time(data_row["time"]).to_ntp64(), tuple(data_row["sample_vector"])]
    sample_desc = dict(cols=["time", "sample_vector"],
                       coltypes=dict(sample_vector="10i2"),
                       data=[sample])
    print sample_desc
    return sample_desc
def build_packet_from_samples(cls, samples, **kwargs):
    num_samples = len(samples["data"])
    dtype_parts = []
    for coldef in samples["cols"]:
        if coldef == "time":
            dtype_parts.append((coldef, "i8"))
        else:
            dtype_parts.append((coldef, "f8"))
    dt = np.dtype(dtype_parts)

    data_array = np.zeros(num_samples, dtype=dt)
    for row_num, data_row in enumerate(samples["data"]):
        row_tuple = tuple(NTP4Time.np_from_string(dv) if isinstance(dv, basestring) else dv for dv in data_row)
        data_array[row_num] = np.array(row_tuple, dtype=dt)
    data = samples.copy()
    data["data"] = data_array

    new_packet = DataPacket(ts_created=get_ion_ts(), data=data)
    for attr in new_packet.__dict__.keys():
        if attr in ('data', 'ts_created'):
            continue
        if attr in kwargs:
            setattr(new_packet, attr, kwargs[attr])
    return new_packet
def _get_data_packet(self, index, num_rows=1):
    """ Return a data packet with a number of samples.
    The index indicates the offset from the starting timestamp, 10 sec per sample. """
    base_ts = 1000000000
    index_ts = base_ts + 10 * index

    # Core samples as provided by agent.acquire_samples
    sample_list = []
    for i in xrange(num_rows):
        ts = index_ts + i * 10
        sample = [NTP4Time(ts).to_ntp64(), float(index + i), random.random() * 100]
        sample_list.append(sample)
    sample_desc = dict(cols=["time", "var1", "random1"], data=sample_list)

    packet = DataPacketBuilder.build_packet_from_samples(sample_desc,
                                                         resource_id="ds_id",
                                                         stream_name="basic_streams")
    return packet
def extend_dataset(self, packet):
    """ Adds values from a data packet to the dataset and updates indexes and metadata """
    ingest_ts = NTP4Time.utcnow()
    num_rows, cur_idx, time_idx_rows = len(packet.data["data"]), 0, []
    ds_filename = self._get_ds_filename()
    data_file = HDFLockingFile(ds_filename, "r+", retry_count=10, retry_wait=0.5)
    file_closed = False
    try:
        if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
            # Get index values from time var
            if self.time_var not in packet.data["cols"]:
                raise BadRequest("Packet has no time")
            var_ds = data_file["vars/%s" % self.time_var]
            cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
            var_ds.attrs["cur_row"] += num_rows

            # Fill variables with values from packet or NaN
            for var_name in self.var_defs_map.keys():
                var_ds = data_file["vars/%s" % var_name]
                if cur_idx + num_rows > cur_size:
                    self._resize_dataset(var_ds, num_rows)
                if var_name in packet.data["cols"]:
                    data_slice = packet.data["data"][:][var_name]
                    var_ds[cur_idx:cur_idx + num_rows] = data_slice
                else:
                    # Leave the initial fill value (zeros)
                    # var_ds[cur_idx:cur_idx+num_rows] = [None]*num_rows
                    pass

            extra_vars = set(packet.data["cols"]) - set(self.var_defs_map.keys())
            if extra_vars:
                log.warn("Data packet had extra vars not in dataset: %s", extra_vars)

        elif self.ds_layout == DS_LAYOUT_COMBINED:
            var_ds = data_file["vars/%s" % DS_VARIABLES]
            cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
            if cur_idx + num_rows > cur_size:
                self._resize_dataset(var_ds, num_rows)
            ds_var_names = [var_info["name"] for var_info in self.var_defs]
            pvi = {col_name: col_idx for col_idx, col_name in enumerate(packet.data["cols"])
                   if col_name in ds_var_names}
            for row_idx in xrange(num_rows):
                row_data = packet.data["data"][row_idx]
                row_vals = tuple(row_data[vn] if vn in pvi else None for vn in ds_var_names)
                var_ds[cur_idx + row_idx] = row_vals
            var_ds.attrs["cur_row"] += num_rows

        # Update time_ingest (ts, begin row, count)
        ds_tingest = data_file[DS_TIMEINGEST_PATH]
        if ds_tingest.attrs["cur_row"] + 1 > len(ds_tingest):
            self._resize_dataset(ds_tingest, 1, INTERNAL_ROW_INCREMENT)
        ds_tingest[ds_tingest.attrs["cur_row"]] = (ingest_ts.to_np_value(), cur_idx, num_rows)
        ds_tingest.attrs["cur_row"] += 1

        # Update time index
        self._update_time_index(data_file, num_rows, cur_idx=cur_idx)

        # Check if pruning is necessary
        if self.prune_trigger_mode == "on_ingest" and self.prune_mode:
            file_closed = self._prune_dataset(data_file)

        # HDF5Tools.dump_hdf5(data_file, with_data=True)
    except Exception:
        log.exception("Error extending dataset %s HDF5 file" % self.dataset_id)
        raise
    finally:
        if not file_closed:
            data_file.close()
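# Hedged end-to-end sketch of the ingest path exercised by extend_dataset: a
# plugin's sample_desc is packed into a DataPacket and appended to the dataset.
# "dataset_hdf5" is a hypothetical handle to the object exposing extend_dataset;
# the column names and values are illustrative.
sample_desc = dict(cols=["time", "cpu_percent"],
                   data=[[NTP4Time.utcnow().to_ntp64(), psutil.cpu_percent()]])
packet = DataPacketBuilder.build_packet_from_samples(sample_desc, resource_id="ds_id")
dataset_hdf5.extend_dataset(packet)   # appends rows, updates indexes, may prune on ingest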
def get_data(self, data_filter=None):
    data_filter = data_filter or {}
    ds_filename = self._get_ds_filename()
    if not os.path.exists(ds_filename):
        return {}
    data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
    try:
        res_data = {}
        read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
        time_format = data_filter.get("time_format", "unix_millis")
        max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
        start_time = data_filter.get("start_time", None)
        end_time = data_filter.get("end_time", None)
        start_time_include = data_filter.get("start_time_include", True) is True
        should_decimate = data_filter.get("decimate", False) is True
        time_slice = None

        start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
        log.info("Row date interval: %s : %s", start_row, end_row)

        if self.expand_info.get("need_expand", False):
            max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
        if end_row - start_row > max_rows:
            log.info("Truncating %s rows to %s max rows (from the end)", end_row - start_row, max_rows)

        if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
            ds_time = data_file["vars/%s" % self.time_var]
            cur_idx = ds_time.attrs["cur_row"]
            for var_name in read_vars:
                ds_path = "vars/%s" % var_name
                if ds_path not in data_file:
                    log.warn("Variable '%s' not in dataset - ignored", var_name)
                    continue
                ds_var = data_file[ds_path]
                data_array = ds_var[max(start_row, end_row - max_rows, 0):end_row]
                if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                    if time_format == "unix_millis":
                        data_array = [int(1000 * NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                    else:
                        data_array = data_array.tolist()
                else:
                    data_array = data_array.tolist()
                if var_name == self.time_var:
                    time_slice = data_array

                res_data[var_name] = data_array

            # At this point we have dict with variable to data array mapping with target (unix) timestamps
            self._expand_packed_rows(res_data, data_filter)

            if data_filter.get("transpose_time", False) is True:
                time_series = res_data.pop(self.time_var)
                for var_name, var_series in res_data.iteritems():
                    res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

            # Downsample: http://stackoverflow.com/questions/20322079/downsample-a-1d-numpy-array

        elif self.ds_layout == DS_LAYOUT_COMBINED:
            raise NotImplementedError()

        return res_data
    finally:
        data_file.close()
def get_data(self, data_filter=None):
    data_filter = data_filter or {}
    ds_filename = self._get_ds_filename()
    if not os.path.exists(ds_filename):
        return {}
    data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
    try:
        res_data = {}
        read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
        time_format = data_filter.get("time_format", "unix_millis")
        max_rows_org = max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
        start_time = data_filter.get("start_time", None)
        end_time = data_filter.get("end_time", None)
        start_time_include = data_filter.get("start_time_include", True) is True
        should_decimate = data_filter.get("decimate", False) is True
        time_slice = None

        start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
        log.info("Get data for row interval %s to %s", start_row, end_row)

        if self.expand_info.get("need_expand", False):
            max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
        if end_row - start_row > max_rows:
            if should_decimate:
                log.info("Decimating %s rows to satisfy %s max rows", end_row - start_row, max_rows_org)
            else:
                log.info("Truncating %s rows to %s max rows, %s unexpanded", end_row - start_row, max_rows_org, max_rows)

        if self.ds_layout != DS_LAYOUT_INDIVIDUAL:
            raise NotImplementedError()

        ds_time = data_file["vars/%s" % self.time_var]
        cur_idx = ds_time.attrs["cur_row"]
        for var_name in read_vars:
            ds_path = "vars/%s" % var_name
            if ds_path not in data_file:
                log.warn("Variable '%s' not in dataset - ignored", var_name)
                continue
            ds_var = data_file[ds_path]
            start_row_act = start_row if should_decimate else max(start_row, end_row - max_rows, 0)
            data_array = ds_var[start_row_act:end_row]
            if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                if time_format == "unix_millis":
                    data_array = [int(1000 * NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                else:
                    data_array = data_array.tolist()
            else:
                data_array = data_array.tolist()
            if var_name == self.time_var:
                time_slice = data_array

            res_data[var_name] = data_array

        # At this point res_data is dict mapping varname to data array. Time values are target (unix) timestamps
        self._expand_packed_rows(res_data, data_filter)
        self._decimate_rows(res_data, data_filter)

        if data_filter.get("transpose_time", False) is True:
            time_series = res_data.pop(self.time_var)
            for var_name, var_series in res_data.iteritems():
                res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

        return res_data
    finally:
        data_file.close()
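# The data_filter keys consumed by get_data above, shown as a single illustrative
# call. Key names come from the code; "dataset_hdf5" and the concrete values are
# assumptions (start_time/end_time are compared as unix milliseconds elsewhere in
# this code, e.g. the start_time computed in _prune_dataset).
window = dataset_hdf5.get_data(data_filter=dict(
    variables=["time", "wave1"],        # default: all defined variables
    start_time=1469000000000,           # unix millis
    end_time=1469000600000,
    start_time_include=True,
    max_rows=2000,                      # rows beyond this are truncated or decimated
    decimate=True,
    time_format="unix_millis",          # NTP64 time values converted on read
    transpose_time=True,                # emit per-variable (time, value) pairs
))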
def test_unix_time_fidelity(self):
    ts = time.time()
    it1 = NTP4Time(ts)
    ts_2 = it1.to_unix()
    self.assertTrue(np.abs(ts - ts_2) <= 1e-3)
def extend_dataset(self, packet):
    """ Adds values from a data packet to the dataset and updates indexes and metadata """
    ingest_ts = NTP4Time.utcnow()
    num_rows, cur_idx, time_idx_rows = len(packet.data["data"]), 0, []
    ds_filename = self._get_ds_filename()
    data_file = HDFLockingFile(ds_filename, "r+", retry_count=10, retry_wait=0.5)
    try:
        if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
            # Get index values from time var
            if self.time_var not in packet.data["cols"]:
                raise BadRequest("Packet has no time")
            var_ds = data_file["vars/%s" % self.time_var]
            cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
            var_ds.attrs["cur_row"] += num_rows

            # Fill variables with values from packet or NaN
            for var_name in self.var_defs_map.keys():
                var_ds = data_file["vars/%s" % var_name]
                if cur_idx + num_rows > cur_size:
                    self._resize_dataset(var_ds, num_rows)
                if var_name in packet.data["cols"]:
                    data_slice = packet.data["data"][:][var_name]
                    var_ds[cur_idx:cur_idx + num_rows] = data_slice
                else:
                    # Leave the initial fill value (zeros)
                    # var_ds[cur_idx:cur_idx+num_rows] = [None]*num_rows
                    pass

            extra_vars = set(packet.data["cols"]) - set(self.var_defs_map.keys())
            if extra_vars:
                log.warn("Data packet had extra vars not in dataset: %s", extra_vars)

        elif self.ds_layout == DS_LAYOUT_COMBINED:
            var_ds = data_file["vars/%s" % DS_VARIABLES]
            cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
            if cur_idx + num_rows > cur_size:
                self._resize_dataset(var_ds, num_rows)
            ds_var_names = [var_info["name"] for var_info in self.var_defs]
            pvi = {col_name: col_idx for col_idx, col_name in enumerate(packet.data["cols"])
                   if col_name in ds_var_names}
            for row_idx in xrange(num_rows):
                row_data = packet.data["data"][row_idx]
                row_vals = tuple(row_data[vn] if vn in pvi else None for vn in ds_var_names)
                var_ds[cur_idx + row_idx] = row_vals
            var_ds.attrs["cur_row"] += num_rows

        # Update time_ingest (ts, begin row, count)
        ds_tingest = data_file[DS_TIMEINGEST_PATH]
        if ds_tingest.attrs["cur_row"] + 1 > len(ds_tingest):
            self._resize_dataset(ds_tingest, 1, INTERNAL_ROW_INCREMENT)
        ds_tingest[ds_tingest.attrs["cur_row"]] = (ingest_ts.to_np_value(), cur_idx, num_rows)
        ds_tingest.attrs["cur_row"] += 1

        # Update time_idx (every nth row's time)
        new_idx_row = (cur_idx + num_rows + self.time_idx_step - 1) / self.time_idx_step
        old_idx_row = (cur_idx + self.time_idx_step - 1) / self.time_idx_step
        num_tidx_rows = new_idx_row - old_idx_row
        time_ds = data_file["vars/%s" % (self.time_var if self.ds_layout == DS_LAYOUT_INDIVIDUAL else DS_VARIABLES)]
        time_idx_rows = [time_ds[idx_row * self.time_idx_step] for idx_row in xrange(old_idx_row, new_idx_row)]
        if time_idx_rows:
            ds_tidx = data_file[DS_TIMEIDX_PATH]
            tidx_cur_row = ds_tidx.attrs["cur_row"]
            if tidx_cur_row + num_tidx_rows > len(ds_tidx):
                self._resize_dataset(ds_tidx, num_tidx_rows, INTERNAL_ROW_INCREMENT)
            ds_tidx[tidx_cur_row:tidx_cur_row + num_tidx_rows] = time_idx_rows
            ds_tidx.attrs["cur_row"] += num_tidx_rows

        # HDF5Tools.dump_hdf5(data_file, with_data=True)
    finally:
        data_file.close()
def test_time_ntp_fidelity(self):
    it1 = NTP4Time()
    ntp_ts = it1.to_ntp64()
    it2 = NTP4Time.from_ntp64(ntp_ts)
    self.assertEquals(it1.seconds, it2.seconds)
    self.assertTrue(np.abs(it1.useconds - it2.useconds) <= 1)
def get_data(self, data_filter=None):
    data_filter = data_filter or {}
    ds_filename = self._get_ds_filename()
    if not os.path.exists(ds_filename):
        return {}
    data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
    try:
        res_data = {}
        read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
        time_format = data_filter.get("time_format", "unix_millis")
        max_rows = data_filter.get("max_rows", 999999999)
        time_slice = None

        if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
            time_ds = data_file["vars/%s" % self.time_var]
            cur_idx = time_ds.attrs["cur_row"]
            for var_name in read_vars:
                ds_path = "vars/%s" % var_name
                if ds_path not in data_file:
                    log.warn("Variable '%s' not in dataset - ignored", var_name)
                    continue
                var_ds = data_file[ds_path]
                data_array = var_ds[max(cur_idx - max_rows, 0):cur_idx]
                if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                    if time_format == "unix_millis":
                        data_array = [int(1000 * NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                    else:
                        data_array = data_array.tolist()
                else:
                    data_array = data_array.tolist()
                if var_name == self.time_var:
                    time_slice = data_array

                res_data[var_name] = data_array

            if data_filter.get("transpose_time", False) is True:
                time_series = res_data.pop(self.time_var)
                for var_name, var_series in res_data.iteritems():
                    res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

            # Downsample: http://stackoverflow.com/questions/20322079/downsample-a-1d-numpy-array

        elif self.ds_layout == DS_LAYOUT_COMBINED:
            raise NotImplementedError()

        start_time = data_filter.get("start_time", None)
        start_time_include = data_filter.get("start_time_include", True) is True
        if time_slice and res_data and start_time:
            start_time = int(start_time)
            time_idx = len(time_slice)
            for idx, tv in enumerate(time_slice):
                if tv == start_time and start_time_include:
                    time_idx = idx
                    break
                elif tv > start_time:
                    time_idx = idx
                    break
            for var_name, var_series in res_data.iteritems():
                res_data[var_name] = var_series[time_idx:]

        return res_data
    finally:
        data_file.close()