Example #1
    def dump_hdf5(cls, data_file, leave_open=False, with_data=False):
        should_close = False
        if isinstance(data_file, basestring) and os.path.exists(data_file):
            filename = data_file
            data_file = HDFLockingFile(data_file, "r", retry_count=10, retry_wait=0.5)
            should_close = True
            print "HDF5", filename, data_file

        else:
            print "HDF5", data_file

        def dump_item(entry_name):
            parts = entry_name.split("/")
            entry = data_file[entry_name]
            ilevel = len(parts)
            print "%s%s %s" % ("  "*ilevel, parts[-1], entry)
            if entry.attrs:
                print "%s  [%s]" % ("  "*ilevel, ", ".join("%s=%s" % (k, v) for (k, v) in entry.attrs.iteritems()))

            if with_data and hasattr(entry, "value"):
                print "%s  %s" % ("  "*ilevel, entry.value)

        data_file.visit(dump_item)

        if should_close and not leave_open:
            data_file.close()

        return data_file
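
A minimal usage sketch for dump_hdf5 above. It assumes the class holding it (HDF5Tools, per the commented-out call in Example #8) is importable; the module path below is a guess, not taken from these examples.

from ion.util.hdf5_tools import HDF5Tools   # hypothetical import path, not shown in the examples

# Passing a path: the helper opens the file read-only via HDFLockingFile,
# prints the group/dataset tree (plus values when with_data=True), and closes
# it again unless leave_open=True.
ds_path = "/tmp/ds_example.hdf5"             # any existing dataset file
HDF5Tools.dump_hdf5(ds_path, with_data=True)
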
Example #2
 def get_data_copy(self, data_filter=None):
     data_filter = data_filter or {}
     ds_filename = self._get_ds_filename()
     if not os.path.exists(ds_filename):
         return {}
     data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
     try:
         return self._get_data_copy(data_file, data_filter=data_filter)
     finally:
         data_file.close()
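
A hedged call sketch: per the helper in Example #17, get_data_copy writes the filtered rows into a temporary HDF5 file and returns its path. The setup below is assumed to mirror the test in Example #14 (ds_id and ds_schema as defined there).

# Assumed setup as in Example #14: ds_id and ds_schema come from that test.
persistence = DatasetHDF5Persistence.get_persistence(ds_id, ds_schema, "hdf5")
persistence.require_dataset()

# Returns {} if the dataset file does not exist yet, None if the copy failed,
# otherwise the path of a temporary HDF5 file holding the filtered rows.
copy_path = persistence.get_data_copy(data_filter={"start_time": 1000000000})
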
Example #3
 def get_data_copy(self, data_filter=None):
     data_filter = data_filter or {}
     ds_filename = self._get_ds_filename()
     if not os.path.exists(ds_filename):
         return {}
     data_file = HDFLockingFile(ds_filename,
                                "r",
                                retry_count=10,
                                retry_wait=0.2)
     try:
         return self._get_data_copy(data_file, data_filter=data_filter)
     finally:
         data_file.close()
Example #4
    def get_data_info(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_info = {}
            max_rows_org = max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
            start_time = data_filter.get("start_time", None)
            end_time = data_filter.get("end_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            should_decimate = data_filter.get("decimate", False) is True

            ds_time = data_file["vars/%s" % self.time_var]
            cur_idx = ds_time.attrs["cur_row"]

            res_info["ds_rows"] = cur_idx
            res_info["ds_size"] = len(ds_time)
            res_info["file_size"] = os.path.getsize(ds_filename)
            res_info["file_name"] = ds_filename
            res_info["vars"] = list(data_file["vars"])

            start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
            res_info["need_expand"] = self.expand_info.get("need_expand", False)
            if self.expand_info.get("need_expand", False):
                max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
            res_info["should_decimate"] = should_decimate
            res_info["need_decimate"] = bool(should_decimate and end_row - start_row > max_rows)

            res_info["ts_first"] = NTP4Time.from_ntp64(ds_time.value[0].tostring()).to_unix()
            res_info["ts_last"] = NTP4Time.from_ntp64(ds_time.value[cur_idx - 1].tostring()).to_unix()
            res_info["ts_first_str"] = get_datetime_str(res_info["ts_first"] * 1000, local_time=False)
            res_info["ts_last_str"] = get_datetime_str(res_info["ts_last"] * 1000, local_time=False)

            res_info["ds_samples"] = cur_idx * self.expand_info["num_steps"] if res_info["need_expand"] else cur_idx

            res_info["filter_start_row"] = start_row
            res_info["filter_end_row"] = end_row
            res_info["filter_max_rows"] = max_rows
            res_info["filter_ts_first"] = NTP4Time.from_ntp64(ds_time.value[start_row].tostring()).to_unix()
            res_info["filter_ts_last"] = NTP4Time.from_ntp64(ds_time.value[end_row - 1].tostring()).to_unix()
            res_info["filter_ts_first_str"] = get_datetime_str(res_info["filter_ts_first"] * 1000, local_time=False)
            res_info["filter_ts_last_str"] = get_datetime_str(res_info["filter_ts_last"] * 1000, local_time=False)

            return res_info

        finally:
            data_file.close()
Example #5
    def get_data_copy(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_data = {}
            read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
            max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
            start_time = data_filter.get("start_time", None)
            end_time = data_filter.get("end_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            time_slice = None

            start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
            log.info("ROW INTERVAL %s %s", start_row, end_row)
            if self.expand_info.get("need_expand", False):
                max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion

            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                ds_time = data_file["vars/%s" % self.time_var]
                cur_idx = ds_time.attrs["cur_row"]
                for var_name in read_vars:
                    ds_path = "vars/%s" % var_name
                    if ds_path not in data_file:
                        log.warn("Variable '%s' not in dataset - ignored", var_name)
                        continue
                    ds_var = data_file[ds_path]
                    data_array = ds_var[max(start_row, end_row-max_rows, 0):end_row]
                    res_data[var_name] = data_array.tolist()

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                raise NotImplementedError()

            return res_data

        finally:
            data_file.close()
Example #6
    def dump_hdf5(cls, data_file, leave_open=False, with_data=False, with_crow=True):
        should_close = False
        if isinstance(data_file, basestring) and os.path.exists(data_file):
            filename = data_file
            data_file = HDFLockingFile(data_file, "r", retry_count=10, retry_wait=0.5)
            should_close = True
            print "HDF5", filename, data_file

        else:
            print "HDF5", data_file

        def dump_item(entry_name):
            parts = entry_name.split("/")
            entry = data_file[entry_name]
            ilevel = len(parts)
            cur_row = None
            print "%s%s %s" % ("  " * ilevel, parts[-1], entry)
            if entry.attrs:
                print "%s  [%s]" % ("  " * ilevel, ", ".join("%s=%s" % (k, v) for (k, v) in entry.attrs.iteritems()))

            if with_data and hasattr(entry, "value"):
                cur_row = entry.attrs["cur_row"] if entry.attrs and "cur_row" in entry.attrs else None
                cur_row = cur_row or (
                    data_file["vars/time"].attrs["cur_row"] if entry_name.startswith("vars") else None
                )
                if with_crow and cur_row:
                    print "%s  %s (%s of %s)" % ("  " * ilevel, entry.value[:cur_row], cur_row, len(entry))
                else:
                    print "%s  %s" % ("  " * ilevel, entry.value)

        data_file.visit(dump_item)

        if should_close and not leave_open:
            data_file.close()

        return data_file
Example #7
    def dump_hdf5(cls,
                  data_file,
                  leave_open=False,
                  with_data=False,
                  with_crow=True):
        should_close = False
        if isinstance(data_file, basestring) and os.path.exists(data_file):
            filename = data_file
            data_file = HDFLockingFile(data_file,
                                       "r",
                                       retry_count=10,
                                       retry_wait=0.5)
            should_close = True
            print "HDF5", filename, data_file

        else:
            print "HDF5", data_file

        def dump_item(entry_name):
            parts = entry_name.split("/")
            entry = data_file[entry_name]
            ilevel = len(parts)
            cur_row = None
            print "%s%s %s" % ("  " * ilevel, parts[-1], entry)
            if entry.attrs:
                print "%s  [%s]" % ("  " * ilevel, ", ".join(
                    "%s=%s" % (k, v) for (k, v) in entry.attrs.iteritems()))

            if with_data and hasattr(entry, "value"):
                cur_row = entry.attrs[
                    "cur_row"] if entry.attrs and "cur_row" in entry.attrs else None
                cur_row = cur_row or (data_file["vars/time"].attrs["cur_row"]
                                      if entry_name.startswith("vars") else
                                      None)
                if with_crow and cur_row:
                    print "%s  %s (%s of %s)" % ("  " * ilevel,
                                                 entry.value[:cur_row],
                                                 cur_row, len(entry))
                else:
                    print "%s  %s" % ("  " * ilevel, entry.value)

        data_file.visit(dump_item)

        if should_close and not leave_open:
            data_file.close()

        return data_file
Example #8
    def extend_dataset(self, packet):
        """
        Adds values from a data packet to the dataset and updates indexes and metadata
        """
        ingest_ts = NTP4Time.utcnow()
        num_rows, cur_idx, time_idx_rows = len(packet.data["data"]), 0, []
        ds_filename = self._get_ds_filename()
        data_file = HDFLockingFile(ds_filename, "r+", retry_count=10, retry_wait=0.5)
        try:
            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                # Get index values from time var
                if self.time_var not in packet.data["cols"]:
                    raise BadRequest("Packet has no time")
                var_ds = data_file["vars/%s" % self.time_var]
                cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
                var_ds.attrs["cur_row"] += num_rows

                # Fill variables with values from packet or NaN
                for var_name in self.var_defs_map.keys():
                    var_ds = data_file["vars/%s" % var_name]
                    if cur_idx + num_rows > cur_size:
                        self._resize_dataset(var_ds, num_rows)
                    if var_name in packet.data["cols"]:
                        data_slice = packet.data["data"][:][var_name]
                        var_ds[cur_idx:cur_idx+num_rows] = data_slice
                    else:
                        # Leave the initial fill value (zeros)
                        #var_ds[cur_idx:cur_idx+num_rows] = [None]*num_rows
                        pass

                extra_vars = set(packet.data["cols"]) - set(self.var_defs_map.keys())
                if extra_vars:
                    log.warn("Data packet had extra vars not in dataset: %s", extra_vars)

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                var_ds = data_file["vars/%s" % DS_VARIABLES]
                cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
                if cur_idx + num_rows > cur_size:
                    self._resize_dataset(var_ds, num_rows)
                ds_var_names = [var_info["name"] for var_info in self.var_defs]
                pvi = {col_name: col_idx for col_idx, col_name in enumerate(packet.data["cols"]) if col_name in ds_var_names}
                for row_idx in xrange(num_rows):
                    row_data = packet.data["data"][row_idx]
                    row_vals = tuple(row_data[vn] if vn in pvi else None for vn in ds_var_names)
                    var_ds[cur_idx+row_idx] = row_vals
                var_ds.attrs["cur_row"] += num_rows

            # Update time_ingest (ts, begin row, count)
            ds_tingest = data_file[DS_TIMEINGEST_PATH]
            if ds_tingest.attrs["cur_row"] + 1 > len(ds_tingest):
                self._resize_dataset(ds_tingest, 1, INTERNAL_ROW_INCREMENT)
            ds_tingest[ds_tingest.attrs["cur_row"]] = (ingest_ts.to_np_value(), cur_idx, num_rows)
            ds_tingest.attrs["cur_row"] += 1

            # Update time_idx (every nth row's time)
            new_idx_row = (cur_idx + num_rows + self.time_idx_step - 1) / self.time_idx_step
            old_idx_row = (cur_idx + self.time_idx_step - 1) / self.time_idx_step
            num_tidx_rows = new_idx_row - old_idx_row
            time_ds = data_file["vars/%s" % (self.time_var if self.ds_layout == DS_LAYOUT_INDIVIDUAL else DS_VARIABLES)]
            time_idx_rows = [time_ds[idx_row*self.time_idx_step] for idx_row in xrange(old_idx_row, new_idx_row)]
            if time_idx_rows:
                ds_tidx = data_file[DS_TIMEIDX_PATH]
                tidx_cur_row = ds_tidx.attrs["cur_row"]
                if tidx_cur_row + num_tidx_rows > len(ds_tidx):
                    self._resize_dataset(ds_tidx, num_tidx_rows, INTERNAL_ROW_INCREMENT)
                ds_tidx[tidx_cur_row:tidx_cur_row+num_tidx_rows] = time_idx_rows
                ds_tidx.attrs["cur_row"] += num_tidx_rows

            #HDF5Tools.dump_hdf5(data_file, with_data=True)
        finally:
            data_file.close()
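
The packet shape that extend_dataset expects is only implied by the code above: data["cols"] lists the column names and data["data"] is indexable both by row and by column name, which a NumPy structured array satisfies. The sketch below builds such a packet with a hypothetical stand-in class; the real packet type is not shown in these examples.

import numpy as np

class _StubPacket(object):            # hypothetical stand-in for the real packet class
    def __init__(self, cols, rows):
        self.data = {"cols": cols,
                     "data": np.array(rows, dtype=[(c, "f8") for c in cols])}

# Two rows with a time column and one value column; extend_dataset slices the
# structured array per column via packet.data["data"][:][var_name].
pkt = _StubPacket(["time", "var1"], [(1.0, 10.0), (2.0, 11.0)])
# persistence.extend_dataset(pkt)     # persistence: a DatasetHDF5Persistence instance
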
Example #9
    def get_data_info(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename,
                                   "r",
                                   retry_count=10,
                                   retry_wait=0.2)
        try:
            res_info = {}
            max_rows_org = max_rows = data_filter.get("max_rows",
                                                      DEFAULT_MAX_ROWS)
            start_time = data_filter.get("start_time", None)
            end_time = data_filter.get("end_time", None)
            start_time_include = data_filter.get("start_time_include",
                                                 True) is True
            should_decimate = data_filter.get("decimate", False) is True

            ds_time = data_file["vars/%s" % self.time_var]
            cur_idx = ds_time.attrs["cur_row"]

            res_info["ds_rows"] = cur_idx
            res_info["ds_size"] = len(ds_time)
            res_info["file_size"] = os.path.getsize(ds_filename)
            res_info["file_name"] = ds_filename
            res_info["vars"] = list(data_file["vars"])

            start_row, end_row = self._get_row_interval(
                data_file, start_time, end_time, start_time_include)
            res_info["need_expand"] = self.expand_info.get(
                "need_expand", False)
            if self.expand_info.get("need_expand", False):
                max_rows = max_rows / self.expand_info[
                    "num_steps"]  # Compensate expansion
            res_info["should_decimate"] = should_decimate
            res_info["need_decimate"] = bool(
                should_decimate and end_row - start_row > max_rows)

            res_info["ts_first"] = NTP4Time.from_ntp64(
                ds_time.value[0].tostring()).to_unix()
            res_info["ts_last"] = NTP4Time.from_ntp64(
                ds_time.value[cur_idx - 1].tostring()).to_unix()
            res_info["ts_first_str"] = get_datetime_str(res_info["ts_first"] *
                                                        1000,
                                                        local_time=False)
            res_info["ts_last_str"] = get_datetime_str(res_info["ts_last"] *
                                                       1000,
                                                       local_time=False)

            res_info["ds_samples"] = cur_idx * self.expand_info[
                "num_steps"] if res_info["need_expand"] else cur_idx

            res_info["filter_start_row"] = start_row
            res_info["filter_end_row"] = end_row
            res_info["filter_max_rows"] = max_rows
            res_info["filter_ts_first"] = NTP4Time.from_ntp64(
                ds_time.value[start_row].tostring()).to_unix()
            res_info["filter_ts_last"] = NTP4Time.from_ntp64(
                ds_time.value[end_row - 1].tostring()).to_unix()
            res_info["filter_ts_first_str"] = get_datetime_str(
                res_info["filter_ts_first"] * 1000, local_time=False)
            res_info["filter_ts_last_str"] = get_datetime_str(
                res_info["filter_ts_last"] * 1000, local_time=False)

            return res_info

        finally:
            data_file.close()
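
A hedged sketch of reading the summary that get_data_info returns; the keys named in the comments are the ones assigned above, and the persistence object is again assumed to be initialized as in Example #14.

# persistence: assumed DatasetHDF5Persistence instance (see Example #14)
info = persistence.get_data_info(data_filter={"max_rows": 500, "decimate": True})
# Keys assigned above include: ds_rows, ds_size, ds_samples, file_size, file_name,
# vars, need_expand, should_decimate, need_decimate, ts_first/ts_last (+ *_str),
# and the filter_* row/timestamp bounds. An empty dict means the file does not exist.
print info.get("ds_rows"), info.get("ts_first_str"), info.get("ts_last_str")
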
Example #10
    def get_data(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename,
                                   "r",
                                   retry_count=10,
                                   retry_wait=0.2)
        try:
            res_data = {}
            read_vars = data_filter.get("variables", []) or [
                var_info["name"] for var_info in self.var_defs
            ]
            time_format = data_filter.get("time_format", "unix_millis")
            max_rows_org = max_rows = data_filter.get("max_rows",
                                                      DEFAULT_MAX_ROWS)
            start_time = data_filter.get("start_time", None)
            end_time = data_filter.get("end_time", None)
            start_time_include = data_filter.get("start_time_include",
                                                 True) is True
            should_decimate = data_filter.get("decimate", False) is True
            time_slice = None

            start_row, end_row = self._get_row_interval(
                data_file, start_time, end_time, start_time_include)
            log.info("Get data for row interval %s to %s", start_row, end_row)
            if self.expand_info.get("need_expand", False):
                max_rows = max_rows / self.expand_info[
                    "num_steps"]  # Compensate expansion
            if end_row - start_row > max_rows:
                if should_decimate:
                    log.info("Decimating %s rows to satisfy %s max rows",
                             end_row - start_row, max_rows_org)
                else:
                    log.info(
                        "Truncating %s rows to %s max rows, %s unexpanded",
                        end_row - start_row, max_rows_org, max_rows)

            if self.ds_layout != DS_LAYOUT_INDIVIDUAL:
                raise NotImplementedError()

            ds_time = data_file["vars/%s" % self.time_var]
            cur_idx = ds_time.attrs["cur_row"]
            for var_name in read_vars:
                ds_path = "vars/%s" % var_name
                if ds_path not in data_file:
                    log.warn("Variable '%s' not in dataset - ignored",
                             var_name)
                    continue
                ds_var = data_file[ds_path]
                start_row_act = start_row if should_decimate else max(
                    start_row, end_row - max_rows, 0)
                data_array = ds_var[start_row_act:end_row]
                if var_name == self.time_var and self.var_defs_map[
                        var_name].get("base_type", "") == "ntp_time":
                    if time_format == "unix_millis":
                        data_array = [
                            int(1000 *
                                NTP4Time.from_ntp64(dv.tostring()).to_unix())
                            for dv in data_array
                        ]
                    else:
                        data_array = data_array.tolist()
                else:
                    data_array = data_array.tolist()
                if var_name == self.time_var:
                    time_slice = data_array

                res_data[var_name] = data_array

            # At this point res_data is dict mapping varname to data array. Time values are target (unix) timestamps
            self._expand_packed_rows(res_data, data_filter)

            self._decimate_rows(res_data, data_filter)

            if data_filter.get("transpose_time", False) is True:
                time_series = res_data.pop(self.time_var)
                for var_name, var_series in res_data.iteritems():
                    res_data[var_name] = [
                        (tv, dv) for (tv, dv) in zip(time_series, var_series)
                    ]

            return res_data

        finally:
            data_file.close()
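
A hedged call sketch showing the filter keys that get_data reads above; the values are illustrative only and are not mandated by these examples beyond the defaults visible in the code.

# persistence: assumed DatasetHDF5Persistence instance (see Example #14)
data = persistence.get_data(data_filter=dict(
    variables=["time", "var1"],   # default: all variables from self.var_defs
    start_time=1000000000,        # passed to _get_row_interval together with end_time
    end_time=1000010990,
    max_rows=500,                 # rows beyond this are truncated or decimated
    decimate=True,
    time_format="unix_millis",    # NTP time values converted to Unix milliseconds
    transpose_time=True))         # each variable becomes a list of (time, value) pairs
# data maps each requested variable name to its data array.
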
Example #11
    def require_dataset(self, ds_filename=None):
        """
        Ensures a dataset HDF5 file exists and creates it if necessary, using the dataset
        schema definition.
        """
        ds_filename = ds_filename or self._get_ds_filename()
        if os.path.exists(ds_filename):
            return ds_filename, False

        log.info("Creating new HDF5 file for dataset_id=%s, file='%s'",
                 self.dataset_id, ds_filename)
        dir_path = os.path.split(ds_filename)[0]
        try:
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
        except OSError as exc:
            import errno
            if exc.errno == errno.EEXIST and os.path.isdir(dir_path):
                pass
            else:
                raise

        data_file = HDFLockingFile(ds_filename,
                                   "w",
                                   retry_count=10,
                                   retry_wait=0.5)
        try:
            data_file.attrs["dataset_id"] = self.dataset_id
            data_file.attrs["layout"] = self.ds_layout
            data_file.attrs["format"] = "scion_hdf5_v1"

            data_file.create_group("vars")
            initial_shape = (self.ds_increment, )

            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                # Individual layout means every variable has its own table - variable values
                # must be coindexed with the time values.
                # The time variable keeps the cur_row attribute which is the next writable index.
                # The length of tables is increased in configurable chunk sizes.
                for position, var_info in enumerate(self.var_defs):
                    var_name = var_info["name"]
                    base_type = var_info.get("base_type", "float")
                    dtype = var_info.get("storage_dtype", "f8")
                    dset = data_file.create_dataset("vars/%s" % var_name,
                                                    initial_shape,
                                                    dtype=dtype,
                                                    maxshape=(None, ))
                    dset.attrs["base_type"] = str(base_type)
                    dset.attrs["position"] = position
                    dset.attrs["description"] = str(
                        var_info.get("description", "") or "")
                    dset.attrs["unit"] = str(var_info.get("unit", "") or "")
                    if var_name == self.time_var:
                        dset.attrs["cur_row"] = 0

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                # EXPERIMENTAL - unsupported for most operations.
                # Combined layout means all variables are in one table of structured type.
                # The cur_row attribute keeps the number of next writable index.
                # The length of the table is increased in configurable chunk sizes.
                dtype_parts = []
                for var_info in self.var_defs:
                    var_name = var_info["name"]
                    base_type = var_info.get("base_type", "float")
                    dtype = var_info.get("storage_dtype", "f8")
                    dtype_parts.append((var_name, dtype))

                dset = data_file.create_dataset("vars/%s" % DS_VARIABLES,
                                                initial_shape,
                                                dtype=np.dtype(dtype_parts),
                                                maxshape=(None, ))
                dset.attrs["dtype_repr"] = repr(dset.dtype)[6:-1]
                dset.attrs["cur_row"] = 0

            # Internal time index - a table indexing every nth row's timestep.
            # Index table grows in constant defined chunk size.
            data_file.create_group("index")
            dtype_tidx = [("time", "i8")]
            ds_tidx = data_file.create_dataset(DS_TIMEIDX_PATH,
                                               (INTERNAL_ROW_INCREMENT, ),
                                               dtype=dtype_tidx,
                                               maxshape=(None, ))
            ds_tidx.attrs["cur_row"] = 0
            ds_tidx.attrs[
                "description"] = "Index of every %s-th time value" % self.time_idx_step
            ds_tidx.attrs["step"] = self.time_idx_step

            # Internal ingest time - table of 3-tuples with timestamp, start row and num rows for a packet
            # Index table grows in constant defined chunk size.
            dtype_tingest = [("time", "i8"), ("row", "u4"), ("count", "u4")]
            ds_tingest = data_file.create_dataset(DS_TIMEINGEST_PATH,
                                                  (INTERNAL_ROW_INCREMENT, ),
                                                  dtype=dtype_tingest,
                                                  maxshape=(None, ))
            ds_tingest.attrs["cur_row"] = 0
            ds_tingest.attrs["description"] = "Maintains ingest times"

        finally:
            data_file.close()

        return ds_filename, True
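
A sketch of inspecting the file that require_dataset creates. The read-only open mirrors the test in Example #14, and it assumes the same module-level names (HDFLockingFile, DS_TIMEIDX_PATH) are in scope and a time variable named "time" as in that test's schema.

# ds_filename: path returned by require_dataset (see Example #14 for full setup)
with HDFLockingFile(ds_filename, "r") as hdff:
    print hdff.attrs["layout"], hdff.attrs["format"]   # e.g. vars_individual / scion_hdf5_v1
    print list(hdff["vars"])                           # one dataset per variable (individual layout)
    print hdff["vars/time"].attrs["cur_row"]           # next writable row; 0 for a fresh file
    print hdff[DS_TIMEIDX_PATH].attrs["step"]          # time index step from the schema
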
Example #12
    def extend_dataset(self, packet):
        """
        Adds values from a data packet to the dataset and updates indexes and metadata
        """
        ingest_ts = NTP4Time.utcnow()
        num_rows, cur_idx, time_idx_rows = len(packet.data["data"]), 0, []
        ds_filename = self._get_ds_filename()
        data_file = HDFLockingFile(ds_filename,
                                   "r+",
                                   retry_count=10,
                                   retry_wait=0.5)
        file_closed = False
        try:
            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                # Get index values from time var
                if self.time_var not in packet.data["cols"]:
                    raise BadRequest("Packet has no time")
                var_ds = data_file["vars/%s" % self.time_var]
                cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
                var_ds.attrs["cur_row"] += num_rows

                # Fill variables with values from packet or NaN
                for var_name in self.var_defs_map.keys():
                    var_ds = data_file["vars/%s" % var_name]
                    if cur_idx + num_rows > cur_size:
                        self._resize_dataset(var_ds, num_rows)
                    if var_name in packet.data["cols"]:
                        data_slice = packet.data["data"][:][var_name]
                        var_ds[cur_idx:cur_idx + num_rows] = data_slice
                    else:
                        # Leave the initial fill value (zeros)
                        #var_ds[cur_idx:cur_idx+num_rows] = [None]*num_rows
                        pass

                extra_vars = set(packet.data["cols"]) - set(
                    self.var_defs_map.keys())
                if extra_vars:
                    log.warn("Data packet had extra vars not in dataset: %s",
                             extra_vars)

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                var_ds = data_file["vars/%s" % DS_VARIABLES]
                cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
                if cur_idx + num_rows > cur_size:
                    self._resize_dataset(var_ds, num_rows)
                ds_var_names = [var_info["name"] for var_info in self.var_defs]
                pvi = {
                    col_name: col_idx
                    for col_idx, col_name in enumerate(packet.data["cols"])
                    if col_name in ds_var_names
                }
                for row_idx in xrange(num_rows):
                    row_data = packet.data["data"][row_idx]
                    row_vals = tuple(row_data[vn] if vn in pvi else None
                                     for vn in ds_var_names)
                    var_ds[cur_idx + row_idx] = row_vals
                var_ds.attrs["cur_row"] += num_rows

            # Update time_ingest (ts, begin row, count)
            ds_tingest = data_file[DS_TIMEINGEST_PATH]
            if ds_tingest.attrs["cur_row"] + 1 > len(ds_tingest):
                self._resize_dataset(ds_tingest, 1, INTERNAL_ROW_INCREMENT)
            ds_tingest[ds_tingest.attrs["cur_row"]] = (ingest_ts.to_np_value(),
                                                       cur_idx, num_rows)
            ds_tingest.attrs["cur_row"] += 1

            # Update time index
            self._update_time_index(data_file, num_rows, cur_idx=cur_idx)

            # Check if pruning is necessary
            if self.prune_trigger_mode == "on_ingest" and self.prune_mode:
                file_closed = self._prune_dataset(data_file)

            #HDF5Tools.dump_hdf5(data_file, with_data=True)
        except Exception:
            log.exception("Error extending dataset %s HDF5 file" %
                          self.dataset_id)
            raise
        finally:
            if not file_closed:
                data_file.close()
Example #13
    def require_dataset(self):
        """
        Ensures a dataset HDF5 file exists and creates it if necessary, using the dataset
        schema definition.
        """
        ds_filename = self._get_ds_filename()
        if os.path.exists(ds_filename):
            return ds_filename, False

        log.info("Creating new HDF5 file for dataset_id=%s, file='%s'", self.dataset_id, ds_filename)
        dir_path = os.path.split(ds_filename)[0]
        try:
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
        except OSError as exc:
            import errno
            if exc.errno == errno.EEXIST and os.path.isdir(dir_path):
                pass
            else:
                raise

        data_file = HDFLockingFile(ds_filename, "w", retry_count=10, retry_wait=0.5)
        try:
            data_file.attrs["dataset_id"] = self.dataset_id
            data_file.attrs["layout"] = self.ds_layout
            data_file.attrs["format"] = "scion_hdf5_v1"

            data_file.create_group("vars")
            initial_shape = (self.ds_increment, )

            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                for position, var_info in enumerate(self.var_defs):
                    var_name = var_info["name"]
                    base_type = var_info.get("base_type", "float")
                    dtype = var_info.get("storage_dtype", "f8")
                    dset = data_file.create_dataset("vars/%s" % var_name, initial_shape,
                                                    dtype=dtype, maxshape=(None, ))
                    dset.attrs["base_type"] = str(base_type)
                    dset.attrs["position"] = position
                    dset.attrs["description"] = str(var_info.get("description", "") or "")
                    dset.attrs["unit"] = str(var_info.get("unit", "") or "")
                    if var_name == self.time_var:
                        dset.attrs["cur_row"] = 0

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                dtype_parts = []
                for var_info in self.var_defs:
                    var_name = var_info["name"]
                    base_type = var_info.get("base_type", "float")
                    dtype = var_info.get("storage_dtype", "f8")
                    dtype_parts.append((var_name, dtype))

                dset = data_file.create_dataset("vars/%s" % DS_VARIABLES, initial_shape,
                                                dtype=np.dtype(dtype_parts), maxshape=(None, ))
                dset.attrs["dtype_repr"] = repr(dset.dtype)[6:-1]
                dset.attrs["cur_row"] = 0

            # Internal time index
            data_file.create_group("index")
            dtype_tidx = [("time", "i8")]
            ds_tidx = data_file.create_dataset(DS_TIMEIDX_PATH, (INTERNAL_ROW_INCREMENT, ),
                                               dtype=dtype_tidx, maxshape=(None, ))
            ds_tidx.attrs["cur_row"] = 0
            ds_tidx.attrs["description"] = "Index of every %s-th time value" % self.time_idx_step
            ds_tidx.attrs["step"] = self.time_idx_step

            # Internal ingest time
            dtype_tingest = [("time", "i8"), ("row", "u4"), ("count", "u4")]
            ds_tingest = data_file.create_dataset(DS_TIMEINGEST_PATH, (INTERNAL_ROW_INCREMENT, ),
                                                  dtype=dtype_tingest, maxshape=(None, ))
            ds_tingest.attrs["cur_row"] = 0
            ds_tingest.attrs["description"] = "Maintains ingest times"

        finally:
            data_file.close()

        return ds_filename, True
Example #14
    def test_hdf5_persist(self):
        # Test HDF5 writing, time indexing, array extension etc
        ds_schema_str = """
        type: scion_data_schema_1
        description: Schema for test datasets
        attributes:
          basic_shape: 1d_timeseries
          time_variable: time
          persistence:
            format: hdf5
            layout: vars_individual
            row_increment: 1000
            time_index_step: 1000
        variables:
          - name: time
            base_type: ntp_time
            storage_dtype: i8
            unit: ""
            description: NTPv4 timestamp
          - name: var1
            base_type: float
            storage_dtype: f8
            unit: ""
            description: Sample value
          - name: random1
            base_type: float
            storage_dtype: f8
            unit: ""
            description: Random values
        """
        ds_schema = yaml.load(ds_schema_str)
        ds_id = create_simple_unique_id()
        ds_filename = self.container.file_system.get(
            "%s/%s%s.hdf5" % (DS_BASE_PATH, DS_FILE_PREFIX, ds_id))

        self.hdf5_persist = DatasetHDF5Persistence.get_persistence(
            ds_id, ds_schema, "hdf5")
        self.hdf5_persist.require_dataset()

        self.assertTrue(os.path.exists(ds_filename))
        self.addCleanup(os.remove, ds_filename)

        # Add 100 values in packets of 10
        for i in xrange(10):
            packet = self._get_data_packet(i * 10, 10)
            self.hdf5_persist.extend_dataset(packet)

        data_res = self.hdf5_persist.get_data()
        self.assertEqual(len(data_res), 3)
        self.assertEqual(len(data_res["time"]), 100)
        self.assertEqual(len(data_res["var1"]), 100)
        self.assertEqual(len(data_res["random1"]), 100)
        self.assertEqual(data_res["var1"][1], 1.0)

        with HDFLockingFile(ds_filename, "r") as hdff:
            ds_time = hdff["vars/time"]
            cur_idx = ds_time.attrs["cur_row"]
            self.assertEqual(cur_idx, 100)
            self.assertEqual(len(ds_time), 1000)

            ds_tidx = hdff[DS_TIMEIDX_PATH]
            cur_tidx = ds_tidx.attrs["cur_row"]
            self.assertEqual(cur_tidx, 1)
            self.assertEqual(len(ds_tidx), 1000)

        # Add 1000 values in packets of 10
        for i in xrange(100):
            packet = self._get_data_packet(100 + i * 10, 10)
            self.hdf5_persist.extend_dataset(packet)

        data_res = self.hdf5_persist.get_data()
        self.assertEqual(len(data_res["time"]), 1100)

        with HDFLockingFile(ds_filename, "r") as hdff:
            ds_time = hdff["vars/time"]
            cur_idx = ds_time.attrs["cur_row"]
            self.assertEqual(cur_idx, 1100)
            self.assertEqual(len(ds_time), 2000)

            ds_tidx = hdff[DS_TIMEIDX_PATH]
            cur_tidx = ds_tidx.attrs["cur_row"]
            self.assertEqual(cur_tidx, 2)
            self.assertEqual(len(ds_tidx), 1000)

            self.assertEqual(ds_time[0], ds_tidx[0][0])
            self.assertEqual(ds_time[1000], ds_tidx[1][0])

        info_res = self.hdf5_persist.get_data_info()

        self.assertEqual(info_res["ds_rows"], 1100)
        self.assertEqual(info_res["ts_first"], 1000000000.0)
        self.assertEqual(info_res["ts_last"], 1000010990.0)
Example #15
    def require_dataset(self, ds_filename=None):
        """
        Ensures a dataset HDF5 file exists and creates it if necessary, using the dataset
        schema definition.
        """
        ds_filename = ds_filename or self._get_ds_filename()
        if os.path.exists(ds_filename):
            return ds_filename, False

        log.info("Creating new HDF5 file for dataset_id=%s, file='%s'", self.dataset_id, ds_filename)
        dir_path = os.path.split(ds_filename)[0]
        try:
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
        except OSError as exc:
            import errno

            if exc.errno == errno.EEXIST and os.path.isdir(dir_path):
                pass
            else:
                raise

        data_file = HDFLockingFile(ds_filename, "w", retry_count=10, retry_wait=0.5)
        try:
            data_file.attrs["dataset_id"] = self.dataset_id
            data_file.attrs["layout"] = self.ds_layout
            data_file.attrs["format"] = "scion_hdf5_v1"

            data_file.create_group("vars")
            initial_shape = (self.ds_increment,)

            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                # Individual layout means every variable has its own table - variable values
                # must be coindexed with the time values.
                # The time variable keeps the cur_row attribute which is the next writable index.
                # The length of tables is increased in configurable chunk sizes.
                for position, var_info in enumerate(self.var_defs):
                    var_name = var_info["name"]
                    base_type = var_info.get("base_type", "float")
                    dtype = var_info.get("storage_dtype", "f8")
                    dset = data_file.create_dataset("vars/%s" % var_name, initial_shape, dtype=dtype, maxshape=(None,))
                    dset.attrs["base_type"] = str(base_type)
                    dset.attrs["position"] = position
                    dset.attrs["description"] = str(var_info.get("description", "") or "")
                    dset.attrs["unit"] = str(var_info.get("unit", "") or "")
                    if var_name == self.time_var:
                        dset.attrs["cur_row"] = 0

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                # EXPERIMENTAL - unsupported for most operations.
                # Combined layout means all variables are in one table of structured type.
                # The cur_row attribute keeps the number of next writable index.
                # The length of the table is increased in configurable chunk sizes.
                dtype_parts = []
                for var_info in self.var_defs:
                    var_name = var_info["name"]
                    base_type = var_info.get("base_type", "float")
                    dtype = var_info.get("storage_dtype", "f8")
                    dtype_parts.append((var_name, dtype))

                dset = data_file.create_dataset(
                    "vars/%s" % DS_VARIABLES, initial_shape, dtype=np.dtype(dtype_parts), maxshape=(None,)
                )
                dset.attrs["dtype_repr"] = repr(dset.dtype)[6:-1]
                dset.attrs["cur_row"] = 0

            # Internal time index - a table indexing every nth row's timestep.
            # Index table grows in constant defined chunk size.
            data_file.create_group("index")
            dtype_tidx = [("time", "i8")]
            ds_tidx = data_file.create_dataset(
                DS_TIMEIDX_PATH, (INTERNAL_ROW_INCREMENT,), dtype=dtype_tidx, maxshape=(None,)
            )
            ds_tidx.attrs["cur_row"] = 0
            ds_tidx.attrs["description"] = "Index of every %s-th time value" % self.time_idx_step
            ds_tidx.attrs["step"] = self.time_idx_step

            # Internal ingest time - table of 3-tuples with timestamp, start row and num rows for a packet
            # Index table grows in constant defined chunk size.
            dtype_tingest = [("time", "i8"), ("row", "u4"), ("count", "u4")]
            ds_tingest = data_file.create_dataset(
                DS_TIMEINGEST_PATH, (INTERNAL_ROW_INCREMENT,), dtype=dtype_tingest, maxshape=(None,)
            )
            ds_tingest.attrs["cur_row"] = 0
            ds_tingest.attrs["description"] = "Maintains ingest times"

        finally:
            data_file.close()

        return ds_filename, True
Example #16
    def extend_dataset(self, packet):
        """
        Adds values from a data packet to the dataset and updates indexes and metadata
        """
        ingest_ts = NTP4Time.utcnow()
        num_rows, cur_idx, time_idx_rows = len(packet.data["data"]), 0, []
        ds_filename = self._get_ds_filename()
        data_file = HDFLockingFile(ds_filename, "r+", retry_count=10, retry_wait=0.5)
        file_closed = False
        try:
            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                # Get index values from time var
                if self.time_var not in packet.data["cols"]:
                    raise BadRequest("Packet has no time")
                var_ds = data_file["vars/%s" % self.time_var]
                cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
                var_ds.attrs["cur_row"] += num_rows

                # Fill variables with values from packet or NaN
                for var_name in self.var_defs_map.keys():
                    var_ds = data_file["vars/%s" % var_name]
                    if cur_idx + num_rows > cur_size:
                        self._resize_dataset(var_ds, num_rows)
                    if var_name in packet.data["cols"]:
                        data_slice = packet.data["data"][:][var_name]
                        var_ds[cur_idx : cur_idx + num_rows] = data_slice
                    else:
                        # Leave the initial fill value (zeros)
                        # var_ds[cur_idx:cur_idx+num_rows] = [None]*num_rows
                        pass

                extra_vars = set(packet.data["cols"]) - set(self.var_defs_map.keys())
                if extra_vars:
                    log.warn("Data packet had extra vars not in dataset: %s", extra_vars)

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                var_ds = data_file["vars/%s" % DS_VARIABLES]
                cur_size, cur_idx = len(var_ds), var_ds.attrs["cur_row"]
                if cur_idx + num_rows > cur_size:
                    self._resize_dataset(var_ds, num_rows)
                ds_var_names = [var_info["name"] for var_info in self.var_defs]
                pvi = {
                    col_name: col_idx
                    for col_idx, col_name in enumerate(packet.data["cols"])
                    if col_name in ds_var_names
                }
                for row_idx in xrange(num_rows):
                    row_data = packet.data["data"][row_idx]
                    row_vals = tuple(row_data[vn] if vn in pvi else None for vn in ds_var_names)
                    var_ds[cur_idx + row_idx] = row_vals
                var_ds.attrs["cur_row"] += num_rows

            # Update time_ingest (ts, begin row, count)
            ds_tingest = data_file[DS_TIMEINGEST_PATH]
            if ds_tingest.attrs["cur_row"] + 1 > len(ds_tingest):
                self._resize_dataset(ds_tingest, 1, INTERNAL_ROW_INCREMENT)
            ds_tingest[ds_tingest.attrs["cur_row"]] = (ingest_ts.to_np_value(), cur_idx, num_rows)
            ds_tingest.attrs["cur_row"] += 1

            # Update time index
            self._update_time_index(data_file, num_rows, cur_idx=cur_idx)

            # Check if pruning is necessary
            if self.prune_trigger_mode == "on_ingest" and self.prune_mode:
                file_closed = self._prune_dataset(data_file)

            # HDF5Tools.dump_hdf5(data_file, with_data=True)
        except Exception:
            log.exception("Error extending dataset %s HDF5 file" % self.dataset_id)
            raise
        finally:
            if not file_closed:
                data_file.close()
Example #17
    def _get_data_copy(self, data_file, data_filter=None):
        """ Helper to copy HDF5 that takes already open file handle """
        data_filter = data_filter or {}
        res_data = {}
        read_vars = data_filter.get("variables", []) or [
            var_info["name"] for var_info in self.var_defs
        ]
        start_time = data_filter.get("start_time", None)
        end_time = data_filter.get("end_time", None)
        start_time_include = data_filter.get("start_time_include",
                                             True) is True
        time_slice = None

        ds_time = data_file["vars/%s" % self.time_var]
        cur_idx = ds_time.attrs["cur_row"]

        start_row, end_row = self._get_row_interval(data_file, start_time,
                                                    end_time,
                                                    start_time_include)
        num_rows = end_row - start_row
        log.info("Copying dataset: %s rows of %s (%s to %s)",
                 end_row - start_row, cur_idx, start_row, end_row)

        if self.ds_layout != DS_LAYOUT_INDIVIDUAL:
            raise NotImplementedError()

        copy_filename = self.container.file_system.get("TEMP/ds_temp_%s.hdf5" %
                                                       uuid.uuid4().hex)
        try:
            self.require_dataset(ds_filename=copy_filename)

            new_file = HDFLockingFile(copy_filename,
                                      "r+",
                                      retry_count=2,
                                      retry_wait=0.1)
            try:
                for var_name in read_vars:
                    ds_path = "vars/%s" % var_name
                    if ds_path not in data_file:
                        log.warn("Variable '%s' not in dataset - ignored",
                                 var_name)
                        continue
                    ds_var = data_file[ds_path]
                    new_ds_var = new_file[ds_path]

                    if num_rows > len(new_ds_var):
                        self._resize_dataset(new_ds_var, num_rows)

                    data_array = ds_var[start_row:end_row]
                    # TODO: Chunkwise copy instead of one big
                    new_ds_var[0:num_rows] = data_array
                    if var_name == self.time_var:
                        new_ds_var.attrs["cur_row"] = num_rows

                    # Use lower level copy
                # Time index
                self._update_time_index(new_file, num_rows, cur_idx=0)

                # Ingest ts - copy from existing, fix index values and prune
                ds_tingest = data_file[DS_TIMEINGEST_PATH]
                new_ds_tingest = new_file[DS_TIMEINGEST_PATH]
                self._resize_dataset(
                    new_ds_tingest, len(ds_tingest),
                    INTERNAL_ROW_INCREMENT)  # Could be smaller
                prev_irow_val, cur_new_row = None, 0
                for irow, irow_val in enumerate(
                        ds_tingest[:ds_tingest.attrs["cur_row"]]):
                    its, iidx, inrows = irow_val
                    if iidx >= start_row:
                        if iidx > start_row and cur_new_row == 0 and prev_irow_val:
                            # The first entry is a partial slice of the previous ingest record
                            pits, piidx, pinrows = prev_irow_val
                            new_ds_tingest[cur_new_row] = (pits, 0,
                                                           iidx - start_row)
                            cur_new_row += 1
                        if iidx + inrows - 1 < end_row:
                            new_ds_tingest[cur_new_row] = (its,
                                                           iidx - start_row,
                                                           min(
                                                               inrows,
                                                               end_row - iidx))
                            cur_new_row += 1
                    prev_irow_val = irow_val
                new_ds_tingest.attrs["cur_row"] = cur_new_row

            finally:
                try:
                    new_file.close()
                except Exception:
                    pass

            log.info(
                "Copy dataset successful: %s rows, %s bytes (original size: %s bytes)",
                num_rows, os.path.getsize(copy_filename),
                os.path.getsize(self._get_ds_filename()))

            #HDF5Tools.dump_hdf5(copy_filename, with_data=True)
        except Exception:
            log.exception("Error copying HDF5 file")
            if os.path.exists(copy_filename):
                os.remove(copy_filename)
            return None

        return copy_filename
Example #18
    def get_data(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_data = {}
            read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
            time_format = data_filter.get("time_format", "unix_millis")
            max_rows_org = max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
            start_time = data_filter.get("start_time", None)
            end_time = data_filter.get("end_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            should_decimate = data_filter.get("decimate", False) is True
            time_slice = None

            start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
            log.info("Get data for row interval %s to %s", start_row, end_row)
            if self.expand_info.get("need_expand", False):
                max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
            if end_row - start_row > max_rows:
                if should_decimate:
                    log.info("Decimating %s rows to satisfy %s max rows", end_row - start_row, max_rows_org)
                else:
                    log.info(
                        "Truncating %s rows to %s max rows, %s unexpanded", end_row - start_row, max_rows_org, max_rows
                    )

            if self.ds_layout != DS_LAYOUT_INDIVIDUAL:
                raise NotImplementedError()

            ds_time = data_file["vars/%s" % self.time_var]
            cur_idx = ds_time.attrs["cur_row"]
            for var_name in read_vars:
                ds_path = "vars/%s" % var_name
                if ds_path not in data_file:
                    log.warn("Variable '%s' not in dataset - ignored", var_name)
                    continue
                ds_var = data_file[ds_path]
                start_row_act = start_row if should_decimate else max(start_row, end_row - max_rows, 0)
                data_array = ds_var[start_row_act:end_row]
                if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                    if time_format == "unix_millis":
                        data_array = [int(1000 * NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                    else:
                        data_array = data_array.tolist()
                else:
                    data_array = data_array.tolist()
                if var_name == self.time_var:
                    time_slice = data_array

                res_data[var_name] = data_array

            # At this point res_data is dict mapping varname to data array. Time values are target (unix) timestamps
            self._expand_packed_rows(res_data, data_filter)

            self._decimate_rows(res_data, data_filter)

            if data_filter.get("transpose_time", False) is True:
                time_series = res_data.pop(self.time_var)
                for var_name, var_series in res_data.iteritems():
                    res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

            return res_data

        finally:
            data_file.close()
Example #19
    def _get_data_copy(self, data_file, data_filter=None):
        """ Helper to copy HDF5 that takes already open file handle """
        data_filter = data_filter or {}
        res_data = {}
        read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
        start_time = data_filter.get("start_time", None)
        end_time = data_filter.get("end_time", None)
        start_time_include = data_filter.get("start_time_include", True) is True
        time_slice = None

        ds_time = data_file["vars/%s" % self.time_var]
        cur_idx = ds_time.attrs["cur_row"]

        start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
        num_rows = end_row - start_row
        log.info("Copying dataset: %s rows of %s (%s to %s)", end_row - start_row, cur_idx, start_row, end_row)

        if self.ds_layout != DS_LAYOUT_INDIVIDUAL:
            raise NotImplementedError()

        copy_filename = self.container.file_system.get("TEMP/ds_temp_%s.hdf5" % uuid.uuid4().hex)
        try:
            self.require_dataset(ds_filename=copy_filename)

            new_file = HDFLockingFile(copy_filename, "r+", retry_count=2, retry_wait=0.1)
            try:
                for var_name in read_vars:
                    ds_path = "vars/%s" % var_name
                    if ds_path not in data_file:
                        log.warn("Variable '%s' not in dataset - ignored", var_name)
                        continue
                    ds_var = data_file[ds_path]
                    new_ds_var = new_file[ds_path]

                    if num_rows > len(new_ds_var):
                        self._resize_dataset(new_ds_var, num_rows)

                    data_array = ds_var[start_row:end_row]
                    # TODO: Chunkwise copy instead of one big
                    new_ds_var[0:num_rows] = data_array
                    if var_name == self.time_var:
                        new_ds_var.attrs["cur_row"] = num_rows

                    # Use lower level copy
                # Time index
                self._update_time_index(new_file, num_rows, cur_idx=0)

                # Ingest ts - copy from existing, fix index values and prune
                ds_tingest = data_file[DS_TIMEINGEST_PATH]
                new_ds_tingest = new_file[DS_TIMEINGEST_PATH]
                self._resize_dataset(new_ds_tingest, len(ds_tingest), INTERNAL_ROW_INCREMENT)  # Could be smaller
                prev_irow_val, cur_new_row = None, 0
                for irow, irow_val in enumerate(ds_tingest[: ds_tingest.attrs["cur_row"]]):
                    its, iidx, inrows = irow_val
                    if iidx >= start_row:
                        if iidx > start_row and cur_new_row == 0 and prev_irow_val:
                            # The first entry is a partial slice of the previous ingest record
                            pits, piidx, pinrows = prev_irow_val
                            new_ds_tingest[cur_new_row] = (pits, 0, iidx - start_row)
                            cur_new_row += 1
                        if iidx + inrows - 1 < end_row:
                            new_ds_tingest[cur_new_row] = (its, iidx - start_row, min(inrows, end_row - iidx))
                            cur_new_row += 1
                    prev_irow_val = irow_val
                new_ds_tingest.attrs["cur_row"] = cur_new_row
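                # Worked illustration with hypothetical numbers (not from the original code):
                # for start_row=100, end_row=150 and ingest rows [(t1, 90, 20), (t2, 110, 40)],
                # the first row (iidx=90) only sets prev_irow_val; the second (iidx=110) first
                # emits (t1, 0, 10) as the partial carry-over of original rows 100..109, then
                # (t2, 10, 40) for rows 110..149, so cur_row ends up as 2.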

            finally:
                try:
                    new_file.close()
                except Exception:
                    pass

            log.info(
                "Copy dataset successful: %s rows, %s bytes (original size: %s bytes)",
                num_rows,
                os.path.getsize(copy_filename),
                os.path.getsize(self._get_ds_filename()),
            )

            # HDF5Tools.dump_hdf5(copy_filename, with_data=True)
        except Exception:
            log.exception("Error copying HDF5 file")
            if os.path.exists(copy_filename):
                os.remove(copy_filename)
            return None

        return copy_filename
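The TODO in the loop above ("chunkwise copy instead of one big copy") could be addressed by copying the selected row range in fixed-size blocks, so that only one block is held in memory at a time. A minimal sketch under that assumption; copy_rows_chunked and chunk_size are illustrative names, not part of the original code, and the function works on any sliceable dataset (h5py datasets included):

def copy_rows_chunked(src_ds, dst_ds, start_row, end_row, chunk_size=65536):
    """Copy src_ds[start_row:end_row] into dst_ds[0:end_row - start_row] block by block."""
    total = end_row - start_row
    for offset in range(0, total, chunk_size):
        count = min(chunk_size, total - offset)
        # Read one block from the source and write it at the matching offset in the copy
        dst_ds[offset:offset + count] = src_ds[start_row + offset:start_row + offset + count]
    return total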
Example #20
    def get_data(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_data = {}
            read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
            time_format = data_filter.get("time_format", "unix_millis")
            max_rows = data_filter.get("max_rows", 999999999)
            time_slice = None
            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                time_ds = data_file["vars/%s" % self.time_var]
                cur_idx = time_ds.attrs["cur_row"]
                for var_name in read_vars:
                    ds_path = "vars/%s" % var_name
                    if ds_path not in data_file:
                        log.warn("Variable '%s' not in dataset - ignored", var_name)
                        continue
                    var_ds = data_file[ds_path]
                    data_array = var_ds[max(cur_idx-max_rows, 0):cur_idx]
                    if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                        if time_format == "unix_millis":
                            data_array = [int(1000*NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                        else:
                            data_array = data_array.tolist()
                    else:
                        data_array = data_array.tolist()
                    if var_name == self.time_var:
                        time_slice = data_array

                    res_data[var_name] = data_array

                if data_filter.get("transpose_time", False) is True:
                    time_series = res_data.pop(self.time_var)
                    for var_name, var_series in res_data.iteritems():
                        res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

                # Downsample: http://stackoverflow.com/questions/20322079/downsample-a-1d-numpy-array

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                raise NotImplementedError()

            start_time = data_filter.get("start_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            if time_slice and res_data and start_time:
                start_time = int(start_time)
                time_idx = len(time_slice)
                for idx, tv in enumerate(time_slice):
                    if tv == start_time and start_time_include:
                        time_idx = idx
                        break
                    elif tv > start_time:
                        time_idx = idx
                        break
                for var_name, var_series in res_data.iteritems():
                    res_data[var_name] = var_series[time_idx:]

            return res_data

        finally:
            data_file.close()
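The "Downsample" comment above only links to an approach; none of the snippets shown here actually reduce the returned series. As a hedged sketch of what block-averaging a 1D array could look like (downsample_mean and factor are illustrative names; the time axis would need the same treatment to stay aligned with the data):

import numpy as np

def downsample_mean(values, factor):
    """Block-average a 1D sequence by 'factor', dropping any trailing partial block."""
    arr = np.asarray(values, dtype=float)
    usable = (len(arr) // factor) * factor   # length that fills whole blocks
    if usable == 0:
        return arr
    return arr[:usable].reshape(-1, factor).mean(axis=1)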
    def get_data(self, data_filter=None):
        data_filter = data_filter or {}
        ds_filename = self._get_ds_filename()
        if not os.path.exists(ds_filename):
            return {}
        data_file = HDFLockingFile(ds_filename, "r", retry_count=10, retry_wait=0.2)
        try:
            res_data = {}
            read_vars = data_filter.get("variables", []) or [var_info["name"] for var_info in self.var_defs]
            time_format = data_filter.get("time_format", "unix_millis")
            max_rows = data_filter.get("max_rows", DEFAULT_MAX_ROWS)
            start_time = data_filter.get("start_time", None)
            end_time = data_filter.get("end_time", None)
            start_time_include = data_filter.get("start_time_include", True) is True
            should_decimate = data_filter.get("decimate", False) is True
            time_slice = None

            start_row, end_row = self._get_row_interval(data_file, start_time, end_time, start_time_include)
            log.info("Row date interval: %s : %s", start_row, end_row)
            if self.expand_info.get("need_expand", False):
                max_rows = max_rows / self.expand_info["num_steps"]  # Compensate expansion
            if end_row-start_row > max_rows:
                log.info("Truncating %s rows to %s max rows (from the end)", end_row-start_row, max_rows)

            if self.ds_layout == DS_LAYOUT_INDIVIDUAL:
                ds_time = data_file["vars/%s" % self.time_var]
                cur_idx = ds_time.attrs["cur_row"]
                for var_name in read_vars:
                    ds_path = "vars/%s" % var_name
                    if ds_path not in data_file:
                        log.warn("Variable '%s' not in dataset - ignored", var_name)
                        continue
                    ds_var = data_file[ds_path]
                    data_array = ds_var[max(start_row, end_row-max_rows, 0):end_row]
                    if var_name == self.time_var and self.var_defs_map[var_name].get("base_type", "") == "ntp_time":
                        if time_format == "unix_millis":
                            data_array = [int(1000*NTP4Time.from_ntp64(dv.tostring()).to_unix()) for dv in data_array]
                        else:
                            data_array = data_array.tolist()
                    else:
                        data_array = data_array.tolist()
                    if var_name == self.time_var:
                        time_slice = data_array

                    res_data[var_name] = data_array

                # At this point we have a dict mapping each variable to its data array, with target (unix) timestamps
                self._expand_packed_rows(res_data, data_filter)

                if data_filter.get("transpose_time", False) is True:
                    time_series = res_data.pop(self.time_var)
                    for var_name, var_series in res_data.iteritems():
                        res_data[var_name] = [(tv, dv) for (tv, dv) in zip(time_series, var_series)]

                # Downsample: http://stackoverflow.com/questions/20322079/downsample-a-1d-numpy-array

            elif self.ds_layout == DS_LAYOUT_COMBINED:
                raise NotImplementedError()

            return res_data

        finally:
            data_file.close()
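Taken together, the read paths above are driven entirely by the data_filter dictionary. A minimal usage sketch; ds, temp and pressure are assumed names, the timestamp values are made up, and the units expected for start_time/end_time depend on _get_row_interval, which is not shown in these examples:

# Illustrative only: 'ds' stands for an instance of the dataset class the methods above belong to
data_filter = {
    "variables": ["temp", "pressure"],   # subset of var_defs; empty or omitted means all variables
    "time_format": "unix_millis",        # convert NTP4 time values to unix milliseconds
    "max_rows": 1000,                    # cap the number of returned rows (taken from the end)
    "start_time": 1500000000000,         # lower time bound (units per _get_row_interval)
    "end_time": 1500003600000,           # upper time bound
    "start_time_include": True,          # include rows exactly at start_time
    "transpose_time": True,              # return [(time, value), ...] pairs per variable
}
res_data = ds.get_data(data_filter=data_filter)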