def write_to_hdf5(cache_handler: h5py.File,
                  embed_stream: Generator[Tuple[List[str], np.ndarray], None,
                                          None]):
    for key, array_data in tqdm(embed_stream):
        # Each key is a sequence of path components: all but the last name
        # nested groups, the last names the dataset itself.
        group_key = '/'.join(key[:-1])
        dataset_key = key[-1]
        # require_group creates the intermediate groups only if they are missing.
        group_obj = cache_handler.require_group(group_key)
        group_obj.create_dataset(name=dataset_key,
                                 data=array_data,
                                 dtype=np.float32)
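A minimal usage sketch (the file name and stream contents below are assumptions, not from the original): each yielded key is a list of path components, where everything but the last component names nested groups and the last names the dataset.

import h5py
import numpy as np

def make_stream():
    # Hypothetical embeddings; any (path components, array) pairs work.
    yield (["layer1", "weights"], np.random.rand(4, 4))
    yield (["layer1", "bias"], np.random.rand(4))
    yield (["layer2", "weights"], np.random.rand(4, 2))

with h5py.File("embeddings_cache.h5", "w") as cache:
    write_to_hdf5(cache, make_stream())
    # Produces datasets /layer1/weights, /layer1/bias and /layer2/weights.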
Example #2
    def dumpToHdf5(
        self, h5_file: h5py.File, inner_path: str, progress_signal: Callable[[int], None] = lambda x: None
    ) -> str:
        progress_signal(0)
        try:
            h5_file.require_group(Path("/").joinpath(inner_path).parent.as_posix())
            graph = Graph()
            op_writer = OpH5N5WriterBigDataset(
                graph=graph,
                h5N5File=h5_file,
                h5N5Path=inner_path,
                CompressionEnabled=False,
                BatchSize=1,
                Image=self.get_provider_slot(graph=graph),
            )
            op_writer.progressSignal.subscribe(progress_signal)
            success = op_writer.WriteImage.value  # reading this slot triggers the write
        finally:
            progress_signal(100)
Example #3
def write_optimization_options(f: h5py.File) -> None:
    """
    Create groups and write some default optimization settings
    """

    # set common options
    g = f.require_group('optimizationOptions')
    g.attrs['optimizer'] = 0  # IpOpt
    g.attrs['retryOptimization'] = 1
    g.attrs['hierarchicalOptimization'] = 1
    g.attrs['numStarts'] = 1

    # set IpOpt options
    g = f.require_group('optimizationOptions/ipopt')
    g.attrs['max_iter'] = 100
    g.attrs['hessian_approximation'] = np.string_("limited-memory")
    g.attrs["limited_memory_update_type"] = np.string_("bfgs")
    g.attrs["tol"] = 1e-9
    g.attrs["acceptable_iter"] = 1
    # set ridiculously high, so only the acceptable_* options below matter
    g.attrs["acceptable_tol"] = 1e20
    g.attrs["acceptable_obj_change_tol"] = 1e-12
    g.attrs["watchdog_shortened_iter_trigger"] = 0

    # set fmincon options
    g = f.require_group('optimizationOptions/fmincon')
    g.attrs['MaxIter'] = 100
    g.attrs["TolX"] = 1e-8
    g.attrs["TolFun"] = 0
    g.attrs["MaxFunEvals"] = 1e7
    g.attrs["algorithm"] = np.string_("interior-point")
    g.attrs["GradObj"] = np.string_("on")
    g.attrs["display"] = np.string_("iter")

    # set CERES options
    g = f.require_group('optimizationOptions/ceres')
    g.attrs['max_num_iterations'] = 100

    # set toms611/SUMSL options
    g = f.require_group('optimizationOptions/toms611')
    g.attrs['mxfcal'] = 1e8
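For completeness, a small read-back sketch (the file name is an assumption); note that attributes written via np.string_ come back as bytes.

import h5py

with h5py.File("optimization_settings.h5", "a") as f:  # hypothetical file name
    write_optimization_options(f)
    opts = f["optimizationOptions"]
    print("numStarts:", opts.attrs["numStarts"])
    ipopt = f["optimizationOptions/ipopt"]
    # Fixed-length string attributes (np.string_) are returned as bytes.
    print("hessian_approximation:", ipopt.attrs["hessian_approximation"].decode())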
Example #4
def write_parameters(file: h5py.File, rows: int, epoch: int,
                     parameters: Sequence[Mapping[str, ParameterSet]]):
    param_group = file.require_group(Group.PARAMETERS)
    for layer in range(len(parameters)):
        for params in parameters[layer].values():
            get_dataset(param_group.require_group(params.name), "values", rows,
                        np.shape(params.values))[epoch] = params.values

            get_dataset(param_group.require_group(params.name), "gradients", rows,
                        np.shape(params.gradients))[epoch] = params.gradients

            delta_values = np.reshape([delta.value for delta in params.deltas.flatten()],
                                      params.deltas.shape)
            get_dataset(param_group.require_group(params.name), "delta_values", rows,
                        np.shape(delta_values))[epoch] = delta_values
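The get_dataset helper is defined elsewhere in that project; a minimal sketch of what it presumably does (an assumption, not the original implementation) is to lazily create a dataset with one row per epoch and return the existing dataset on later calls.

import h5py
import numpy as np

def get_dataset(group: h5py.Group, name: str, rows: int, value_shape) -> h5py.Dataset:
    # Assumed behaviour: one slot per epoch (rows), value_shape trailing dims.
    if name not in group:
        group.create_dataset(name, shape=(rows, *value_shape), dtype=np.float32)
    return group[name]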
Example #5
    def _save_hdf5(self, file: h5py.File):
        """
        Actual implementation of HDF5 saving.

        Args:
            file: The open h5py.File to write the skeleton data to.

        Returns:
            None
        """

        # All skeletons are stored in the top-level "skeleton" group
        if "skeleton" not in file:
            all_sk_group = file.create_group("skeleton", track_order=True)
        else:
            all_sk_group = file.require_group("skeleton")

        # Serialize the skeleton to a JSON string, then store it as a
        # string attribute on the group, keyed by the skeleton's name
        all_sk_group.attrs[self.name] = np.string_(self.to_json())
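Reading the skeletons back is then a matter of decoding and parsing the stored JSON; a hedged sketch (the file name is an assumption):

import json
import h5py

with h5py.File("labels.h5", "r") as f:  # hypothetical file name
    for name, raw in f["skeleton"].attrs.items():
        # np.string_ attributes are returned as bytes.
        skeleton_dict = json.loads(raw.decode() if isinstance(raw, bytes) else raw)
        print(name, "->", list(skeleton_dict))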
Example #6
class DataExplorer:
    """Navigate datafile created through aggregation or processing.

    Valid datafiles are those created by DataClassifier.consolidate_data and
    DataProcessor.run_process

    In those files the top-level refers either to:
    - a kind of measurement (DataClassifier.consolidate_data)
    - a tier of analysis (DataProcessor.run_process)

    """

    #: Path to the file to open.
    path: str

    #: Should the file be opened in a mode that allows editing it.
    allow_edits: bool = False

    #: Should a brand new file be created.
    create_new: bool = False

    def open(self) -> None:
        """Open the underlying HDF5 file."""
        mode = "w" if self.create_new else ("r+" if self.allow_edits else "r")
        self._file = File(self.path, mode)

    def close(self) -> None:
        """Close the underlying HDF5 file."""
        if self._file:
            self._file.close()
        self._file = None

    def __enter__(self) -> "DataExplorer":
        """Open the underlying HDF5 file when used as a context manager."""
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the underlying HDF5 file when used as a context manager."""
        self.close()

    def list_top_level(self) -> List[str]:
        """List the top level groups (measurement or tier)."""
        if not self._file:
            raise RuntimeError("No opened datafile")
        return list(self._file.keys())

    def list_classifiers(self, measurement: str) -> Dict[int, List[str]]:
        """List the classifiers name by level."""
        if not self._file:
            raise RuntimeError("No opened datafile")
        if measurement not in self._file:
            raise ValueError(
                f"No measurement {measurement} in opened datafile, "
                f"existing measurements are {self.list_top_level()}")

        def extract_classifiers(group: Group, classifiers: Dict[int,
                                                                List[str]],
                                level: int) -> Dict[int, List[str]]:
            # By construction the classifiers are the same on each level
            # so we only visit one level of each
            for entry in group.values():
                if isinstance(entry, Group):
                    classifiers[level] = list(entry.attrs)
                    extract_classifiers(entry, classifiers, level + 1)
                    break
            return classifiers

        return extract_classifiers(self._file[measurement], dict(), 0)

    def walk_data(
            self, measurement: str
    ) -> Iterator[Tuple[Dict[int, Dict[str, Any]], Group]]:
        """Iterate over all the data found under one top level entry.

        This function provides the classifiers and the group containing the
        datasets of interest.

        """
        # Maximal depth of classifiers
        max_depth = len(self.list_classifiers(measurement))

        def yield_classifier_and_data(
            group: Group, depth: int, classifiers: Dict[int, Dict[str, Any]]
        ) -> Iterator[Tuple[Dict[int, Dict[str, Any]], Group]]:
            # If the group has any dataset yield it and then keep going
            # This is relevant for processed data merged from different measurements
            if any(isinstance(k, Dataset) for k in group.values()):
                yield classifiers, group
            if depth == max_depth - 1:
                for g in [g for g in group.values() if isinstance(g, Group)]:
                    clfs = classifiers.copy()
                    clfs[depth] = dict(g.attrs)
                    yield clfs, g
            else:
                # Datasets can coexist with groups at intermediate levels
                # (see the check above), so recurse only into sub-groups.
                for g in [g for g in group.values() if isinstance(g, Group)]:
                    clfs = classifiers.copy()
                    clfs[depth] = dict(g.attrs)
                    yield from yield_classifier_and_data(g, depth + 1, clfs)

        yield from yield_classifier_and_data(self._file[measurement], 0,
                                             dict())

    def get_data(self, toplevel: str,
                 classifiers: Dict[int, Dict[str, Any]]) -> Group:
        """Retrieve the group containing the datasets corresponding to the classifiers.

        """
        known = self.list_classifiers(toplevel)
        if {k: list(v) for k, v in classifiers.items()} != known:
            raise ValueError(f"Unknown classifiers used ({classifiers}),"
                             f" known classifiers are {known}")

        group = self._file[toplevel]
        for level, values in classifiers.items():
            key = make_group_name(values)
            if key not in group:
                raise ValueError(
                    f"No entry of level {level} found for {values}, "
                    f"at this level known entries are "
                    f"{[dict(g.attrs) for g in group.values()]}."
                )
            group = group[key]

        return group

    def require_group(self, toplevel: str,
                      classifiers: Dict[int, Dict[str, Any]]) -> Group:
        """Access the group matching the toplevel and classifiers.

        If any group does not exist it is created.

        """
        # Ensure the top group is present
        group = self._file.require_group(toplevel)

        # At each classifier level check if the group exist, create it if necessary
        for level, values in classifiers.items():
            key = make_group_name(values)
            if key not in group:
                group = group.create_group(key)
                group.attrs.update(values)
            else:
                group = group[key]

        return group
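A hypothetical usage sketch, assuming DataExplorer is a dataclass so the attributes above double as constructor arguments, and that a datafile named "consolidated.h5" (an assumed name) already exists:

with DataExplorer(path="consolidated.h5") as explorer:
    for measurement in explorer.list_top_level():
        print(measurement, explorer.list_classifiers(measurement))
        # Walk every group holding datasets, together with its classifiers.
        for classifiers, group in explorer.walk_data(measurement):
            print(classifiers, list(group.keys()))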
Example #7
class HDF5Recorder(BaseRecorder):
    """
    A recorder that stores data using HDF5. This format naturally handles
    hierarchical data and is a standard for handling large datasets.

    Args
    ----
    out : str
        String containing the filename for the HDF5 file.

    **driver_kwargs
        Additional keyword args to be passed to the HDF5 driver.

    Options
    -------
    options['record_metadata'] :  bool(True)
        Tells recorder whether to record variable attribute metadata.
    options['record_unknowns'] :  bool(True)
        Tells recorder whether to record the unknowns vector.
    options['record_params'] :  bool(False)
        Tells recorder whether to record the params vector.
    options['record_resids'] :  bool(False)
        Tells recorder whether to record the residuals vector.
    options['includes'] :  list of strings
        Patterns for variables to include in recording.
    options['excludes'] :  list of strings
        Patterns for variables to exclude in recording (processed after includes).
    """

    def __init__(self, out, **driver_kwargs):

        super(HDF5Recorder, self).__init__()
        self.out = File(out, "w", **driver_kwargs)

        metadata_group = self.out.require_group("metadata")

        metadata_group.create_dataset("format_version", data=format_version)

    def record_metadata(self, group):
        """Stores the metadata of the given group in a HDF5 file using
        the variable name for the key.

        Args
        ----
        group : `System`
            `System` containing vectors
        """
        params = group.params.iteritems()
        resids = group.resids.iteritems()
        unknowns = group.unknowns.iteritems()

        metadata_group = self.out["metadata"]

        # The group metadata could be anything so need to pickle it
        # There are other ways of storing any kind of Python object in HDF5 but this is the simplest
        system_metadata_val = np.array(pickle.dumps(group.metadata, pickle.HIGHEST_PROTOCOL))
        metadata_group.create_dataset("system_metadata", data=system_metadata_val)

        # Also store the model_viewer_data
        model_viewer_data = get_model_viewer_data(group)
        model_viewer_data_val = np.array(pickle.dumps(model_viewer_data, pickle.HIGHEST_PROTOCOL))
        metadata_group.create_dataset("model_viewer_data", data=model_viewer_data_val)

        pairings = (
            (metadata_group.create_group("Parameters"), params),
            (metadata_group.create_group("Unknowns"), unknowns),
        )

        for grp, data in pairings:
            for key, val in data:
                meta_group = grp.create_group(key)

                for mkey, mval in iteritems(val):
                    meta_group.create_dataset(mkey, data=mval)
                    # if isinstance(val, (np.ndarray, Number)):
                    #    grp.create_dataset(key, data=val)
                    #    # TODO: Compression/Checksum?
                    # else:
                    #    # TODO: Handling non-numeric data
                    #    msg = "HDF5 Recorder does not support data of type '{0}'".format(type(val))
                    #    raise NotImplementedError(msg)

    def record_iteration(self, params, unknowns, resids, metadata):
        """
        Stores the provided data in the HDF5 file using the iteration
        coordinate for the Group name.

        Args
        ----
        params : dict
            Dictionary containing parameters. (p)

        unknowns : dict
            Dictionary containing outputs and states. (u)

        resids : dict
            Dictionary containing residuals. (r)

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """

        iteration_coordinate = metadata["coord"]
        group_name = format_iteration_coordinate(iteration_coordinate)

        f = self.out

        group = f.require_group(group_name)
        group.attrs["timestamp"] = metadata["timestamp"]
        group.attrs["success"] = metadata["success"]
        group.attrs["msg"] = metadata["msg"]

        pairings = []

        if self.options["record_params"]:
            p_group = group.create_group("Parameters")
            pairings.append((p_group, self._filter_vector(params, "p", iteration_coordinate)))

        if self.options["record_unknowns"]:
            u_group = group.create_group("Unknowns")
            pairings.append((u_group, self._filter_vector(unknowns, "u", iteration_coordinate)))

        if self.options["record_resids"]:
            r_group = group.create_group("Residuals")
            pairings.append((r_group, self._filter_vector(resids, "r", iteration_coordinate)))

        for grp, data in pairings:
            for key, val in iteritems(data):
                if isinstance(val, (np.ndarray, Number)):
                    grp.create_dataset(key, data=val)
                    # TODO: Compression/Checksum?
                else:
                    # TODO: Handling non-numeric data
                    msg = "HDF5 Recorder does not support data of type '{0}'".format(type(val))
                    raise NotImplementedError(msg)

    def record_derivatives(self, derivs, metadata):
        """Writes the derivatives that were calculated for the driver.

        Args
        ----
        derivs : dict
            Dictionary containing derivatives

        metadata : dict, optional
            Dictionary containing execution metadata (e.g. iteration coordinate).
        """

        iteration_coordinate = metadata["coord"]
        group_name = format_iteration_coordinate(iteration_coordinate)

        # get the group for the iteration
        iteration_group = self.out[group_name]

        # Create a group under that called 'Derivs'
        deriv_group = iteration_group.require_group("Derivs")

        # Then add timestamp, success, msg as attributes
        deriv_group.attrs["timestamp"] = metadata["timestamp"]
        deriv_group.attrs["success"] = metadata["success"]
        deriv_group.attrs["msg"] = metadata["msg"]

        #  And actual deriv data. derivs could either be a dict or an ndarray
        #    depending on the optimizer
        if isinstance(derivs, np.ndarray):
            deriv_group.create_dataset("Derivatives", data=derivs)
        elif isinstance(derivs, OrderedDict):
            deriv_data_group = deriv_group.require_group("Derivatives")
            for k, v in derivs.items():
                g = deriv_data_group.require_group(k)
                for k2, v2 in v.items():
                    g.create_dataset(k2, data=v2)
        else:
            raise ValueError("Currently can only record derivatives that are ndarrays or OrderedDicts")