Example #1
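This function is excerpted from pyUSID's hdf_utils module, so the various validate_* and write_* helpers it calls resolve within that package. A minimal sketch of the external imports the snippet assumes (the pyUSID-internal helper imports are omitted, since their module paths vary across versions):

from warnings import warn

import h5py
import numpy as np
import dask.array as da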
def write_main_dataset(h5_parent_group,
                       main_data,
                       main_data_name,
                       quantity,
                       units,
                       pos_dims,
                       spec_dims,
                       main_dset_attrs=None,
                       h5_pos_inds=None,
                       h5_pos_vals=None,
                       h5_spec_inds=None,
                       h5_spec_vals=None,
                       aux_spec_prefix='Spectroscopic_',
                       aux_pos_prefix='Position_',
                       verbose=False,
                       **kwargs):
    """
    Writes the provided data as a 'Main' dataset with all appropriate linking.
    By default, the instructions for generating the ancillary datasets should be specified via the pos_dims and
    spec_dims arguments as Dimension objects. Alternatively, if the indices and values datasets already exist for
    the position and/or spectroscopic dimensions, they can be passed in via the keyword arguments, in which case
    fresh ancillary datasets will not be generated for those dimensions.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group`
        Parent group under which the datasets will be created
    main_data : numpy.ndarray, dask.array.core.Array, list or tuple
        2D matrix formatted as [position, spectral] or a list / tuple with the shape for an empty dataset.
        If creating an empty dataset - the dtype must be specified via a kwarg.
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-' character.
    quantity : String / Unicode
        Name of the physical quantity stored in the dataset. Example - 'Current'
    units : String / Unicode
        Name of units for the quantity stored in the dataset. Example - 'A' for amperes
    pos_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects providing all necessary instructions for building the Position indices and
        values datasets
    spec_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects providing all necessary instructions for building the Spectroscopic indices
        and values datasets
    main_dset_attrs : dictionary, Optional
        Dictionary of parameters that will be written to the main dataset. Do NOT include region references here.
    h5_pos_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Indices"
    h5_pos_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Values"
    h5_spec_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Indices"
    h5_spec_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Values"
    aux_spec_prefix : str or unicode, Optional
        Prefix for the auto-generated Spectroscopic datasets. Default = 'Spectroscopic_'
    aux_pos_prefix : str or unicode, Optional
        Prefix for the auto-generated Position datasets. Default = 'Position_'
    verbose : bool, Optional, default=False
        If set to True, prints debugging logs
    kwargs : dict, Optional
        Additional keyword arguments that will be passed on to the creation of the dataset. Please pass chunking,
        compression, dtype, and other arguments this way

    Returns
    -------
    h5_main : USIDataset
        Reference to the main dataset

    """
    def __check_anc_before_creation(aux_prefix, dim_type='pos'):
        aux_prefix = validate_single_string_arg(aux_prefix,
                                                'aux_' + dim_type + '_prefix')
        if not aux_prefix.endswith('_'):
            aux_prefix += '_'
        if '-' in aux_prefix:
            warn(
                'aux_' + dim_type +
                '_prefix should not contain the "-" character. Reformatted '
                'name from: {} to {}'.format(aux_prefix,
                                             aux_prefix.replace('-', '_')))
        aux_prefix = aux_prefix.replace('-', '_')
        for dset_name in [aux_prefix + 'Indices', aux_prefix + 'Values']:
            if dset_name in h5_parent_group.keys():
                raise KeyError('Dataset named: ' + dset_name +
                               ' already exists in group: '
                               '{}'.format(h5_parent_group.name))
        return aux_prefix

    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    quantity, units, main_data_name = validate_string_args(
        [quantity, units, main_data_name],
        ['quantity', 'units', 'main_data_name'])
    if verbose:
        print('quantity, units, main_data_name all OK')

    quantity = quantity.strip()
    units = units.strip()
    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn(
            'main_data_name should not contain the "-" character. Reformatted '
            'name from: {} to {}'.format(main_data_name,
                                         main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    if isinstance(main_data, (list, tuple)):
        if not contains_integers(main_data, min_val=1):
            raise ValueError(
                'main_data if specified as a shape should be a list / tuple of integers >= 1'
            )
        if len(main_data) != 2:
            raise ValueError(
                'main_data if specified as a shape should contain 2 numbers')
        if 'dtype' not in kwargs:
            raise ValueError(
                'dtype must be included as a kwarg when creating an empty dataset'
            )
        _ = validate_dtype(kwargs.get('dtype'))
        main_shape = main_data
        if verbose:
            print('Selected empty dataset creation. OK so far')
    elif isinstance(main_data, (np.ndarray, da.core.Array)):
        if main_data.ndim != 2:
            raise ValueError('main_data should be a 2D array')
        main_shape = main_data.shape
        if verbose:
            print('Provided numpy or Dask array for main_data OK so far')
    else:
        raise TypeError(
            'main_data should either be a numpy / dask array or a tuple / '
            'list with the shape of the data'
        )

    if h5_pos_inds is not None and h5_pos_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_pos_inds,
                              h5_pos_vals,
                              main_shape,
                              is_spectroscopic=False)
        if verbose:
            print('Provided h5 position indices and values OK')
    else:
        aux_pos_prefix = __check_anc_before_creation(aux_pos_prefix,
                                                     dim_type='pos')
        pos_dims = validate_dimensions(pos_dims, dim_type='Position')
        validate_dims_against_main(main_shape,
                                   pos_dims,
                                   is_spectroscopic=False)
        if verbose:
            print('Passed all pre-tests for creating position datasets')
        h5_pos_inds, h5_pos_vals = write_ind_val_dsets(
            h5_parent_group,
            pos_dims,
            is_spectral=False,
            verbose=verbose,
            base_name=aux_pos_prefix)
        if verbose:
            print('Created position datasets!')

    if h5_spec_inds is not None and h5_spec_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_spec_inds,
                              h5_spec_vals,
                              main_shape,
                              is_spectroscopic=True)
        if verbose:
            print('Provided h5 spectroscopic datasets were OK')
    else:
        aux_spec_prefix = __check_anc_before_creation(aux_spec_prefix,
                                                      dim_type='spec')
        spec_dims = validate_dimensions(spec_dims, dim_type='Spectroscopic')
        validate_dims_against_main(main_shape,
                                   spec_dims,
                                   is_spectroscopic=True)
        if verbose:
            print('Passed all pre-tests for creating spectroscopic datasets')
        h5_spec_inds, h5_spec_vals = write_ind_val_dsets(
            h5_parent_group,
            spec_dims,
            is_spectral=True,
            verbose=verbose,
            base_name=aux_spec_prefix)
        if verbose:
            print('Created Spectroscopic datasets')

    if h5_parent_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn(
                'This HDF5 file has been opened with the "mpio" communicator. '
                'mpi4py does not allow creation of compressed datasets. The '
                'compression kwarg has been removed'
            )

    if isinstance(main_data, np.ndarray):
        # Case 1 - simple small dataset
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 data=main_data,
                                                 **kwargs)
        if verbose:
            print('Created main dataset with provided data')
    elif isinstance(main_data, da.core.Array):
        # Case 2 - Dask dataset
        # step 0 - get rid of any automated dtype specification:
        _ = kwargs.pop('dtype', None)
        # step 1 - create the empty dataset:
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 shape=main_data.shape,
                                                 dtype=main_data.dtype,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset: {} for writing Dask dataset: {}'.
                  format(h5_main, main_data))
            print(
                'Dask array will be written to HDF5 dataset: "{}" in file: "{}"'
                .format(h5_main.name, h5_main.file.filename))
        # Step 2 - now ask Dask to dump data to disk
        da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
        # main_data.to_hdf5(h5_main.file.filename, h5_main.name)  # Does not work with python 2 for some reason
    else:
        # Case 3 - large empty dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, main_data,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset for Main')

    write_simple_attrs(h5_main, {'quantity': quantity, 'units': units})
    if verbose:
        print('Wrote quantity and units attributes to main dataset')

    if isinstance(main_dset_attrs, dict):
        write_simple_attrs(h5_main, main_dset_attrs)
        if verbose:
            print('Wrote provided attributes to main dataset')

    write_book_keeping_attrs(h5_main)

    # Link the ancillary datasets to the new dataset to make it a 'Main' dataset
    link_as_main(h5_main, h5_pos_inds, h5_pos_vals, h5_spec_inds, h5_spec_vals)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    from ..usi_data import USIDataset
    return USIDataset(h5_main)
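A minimal usage sketch for the function above, assuming pyUSID is installed (the import paths follow pyUSID at the time of this snippet and may differ in newer releases; the file name and dimension names / values are illustrative):

import h5py
import numpy as np
from pyUSID import Dimension
from pyUSID.io.hdf_utils import write_main_dataset

# A 2 x 3 position grid and a 7-point spectrum -> (6, 7) main matrix
main_data = np.random.rand(6, 7)
pos_dims = [Dimension('X', 'um', np.arange(2)),
            Dimension('Y', 'um', np.arange(3))]
spec_dims = Dimension('Bias', 'V', np.linspace(-1, 1, 7))

with h5py.File('example.h5', mode='w') as h5_f:
    h5_main = write_main_dataset(h5_f, main_data, 'Raw_Data',
                                 'Current', 'A', pos_dims, spec_dims)
    print(h5_main)  # USIDataset wrapping the freshly written 'Main' dataset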
Example #2
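This method is excerpted from pyUSID's ArrayTranslator class (hence the self argument). Besides the pyUSID-internal helpers, a minimal sketch of the external imports the snippet assumes:

from os import path, remove

import h5py
import numpy as np
import dask.array as da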
    def translate(self, h5_path, data_name, raw_data, quantity, units, pos_dims, spec_dims,
                  translator_name='ArrayTranslator', parm_dict=None, extra_dsets=None, **kwargs):
        """
        Writes the provided datasets and parameters to an h5 file

        Parameters
        ----------
        h5_path : str
            Absolute path of the h5 file to be written
        data_name : str
            Name of the scientific data type. Example - 'SEM'
        raw_data : :class:`np.ndarray` or :class:`dask.array.core.Array`
            2D matrix formatted as [position, spectral]
        quantity : str
            Name of the physical quantity stored in the dataset. Example - 'Current'
        units : str
            Name of units for the quantity stored in the dataset. Example - 'A' for amperes
        pos_dims : :class:`~pyUSID.io.write_utils.Dimension` or array-like of :class:`~pyUSID.io.write_utils.Dimension`
            Sequence of :class:`~pyUSID.io.write_utils.Dimension` objects providing all necessary instructions for
            building the Position indices and values datasets
        spec_dims : :class:`~pyUSID.io.write_utils.Dimension` or array-like of :class:`~pyUSID.io.write_utils.Dimension`
            Sequence of :class:`~pyUSID.io.write_utils.Dimension` objects providing all necessary instructions for
            building the Spectroscopic indices and values datasets
        translator_name : str, Optional
            Name of the translator. Example - 'HitachiSEMTranslator'
        parm_dict : dict, Optional
            Dictionary of parameters that will be written under the group 'Measurement_000'
        extra_dsets : dict, Optional
            Dictionary whose values will be written into individual HDF5 datasets and whose corresponding keys provide
            the names of the datasets. It is recommended to limit these to simple and small datasets.
        kwargs : dict, Optional
            Additional keyword arguments that will be passed onto :meth:`pyUSID.hdf_utils.write_main_dataset()`, which
            will in turn pass them on to the creation of the dataset. Please pass chunking, compression, dtype, and
            other arguments this way

        Returns
        -------
        h5_path : str
            Absolute path of the written h5 file

        """

        h5_path, data_name, translator_name, quantity, units = validate_string_args([h5_path, data_name,
                                                                                     translator_name, quantity, units],
                                                                                    ['h5_path', 'data_name',
                                                                                     'translator_name', 'quantity',
                                                                                     'units'])
        validate_main_dset(raw_data, False)

        for dimensions, dim_name in zip([pos_dims, spec_dims], ['Position', 'Spectroscopic']):
            dimensions = validate_dimensions(dimensions, dim_type=dim_name)
            validate_dims_against_main(raw_data.shape, dimensions, dim_name == 'Spectroscopic')

        if extra_dsets is not None:
            if not isinstance(extra_dsets, dict):
                raise TypeError('extra_dsets should be specified as a dictionary')
            for key, val in extra_dsets.items():
                [key] = validate_string_args(key, 'keys for extra_dsets')
                # Disallow keys that clash with reserved dataset names (note: 'key in x' is a substring check)
                if np.any([key in x for x in ['Spectroscopic_Indices', 'Spectroscopic_Values', 'Position_Indices',
                                              'Position_Values', 'Raw_Data']]):
                    raise KeyError('keys for extra_dsets cannot match reserved names for existing datasets')
                # Now check for data:
                if not isinstance(val, (list, tuple, np.ndarray, da.core.Array)):
                    raise TypeError('values for extra_dsets should be a tuple, list, or numpy / dask array')
        else:
            extra_dsets = dict()

        if path.exists(h5_path):
            remove(h5_path)

        if parm_dict is None:
            parm_dict = {}

        global_parms = generate_dummy_main_parms()
        global_parms['data_type'] = data_name
        global_parms['translator'] = translator_name

        # Begin writing to file:
        with h5py.File(h5_path, mode='w') as h5_f:

            # Root attributes first:
            write_simple_attrs(h5_f, global_parms)
            write_book_keeping_attrs(h5_f)

            # measurement group next
            meas_grp = create_indexed_group(h5_f, 'Measurement')
            write_simple_attrs(meas_grp, parm_dict)

            # channel group next
            chan_grp = create_indexed_group(meas_grp, 'Channel')

            _ = write_main_dataset(chan_grp, raw_data, 'Raw_Data', quantity, units, pos_dims, spec_dims, **kwargs)

            for key, val in extra_dsets.items():
                if isinstance(val, da.core.Array):
                    da.to_hdf5(chan_grp.file.filename, {chan_grp.name + '/' + key: val})
                else:
                    chan_grp.create_dataset(key.strip(), data=val)

        return h5_path
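A minimal usage sketch for the translator, with illustrative names and values (the exact import path of ArrayTranslator varies across pyUSID versions):

import numpy as np
from pyUSID import Dimension
from pyUSID.io import ArrayTranslator  # module path may differ by pyUSID version

raw_data = np.random.rand(4, 128)  # 4 positions x 128 spectral points
tran = ArrayTranslator()
h5_path = tran.translate('measurement.h5', 'SEM', raw_data, 'Current', 'A',
                         Dimension('X', 'um', np.arange(4)),
                         Dimension('Bias', 'V', np.linspace(-2, 2, 128)))
print(h5_path)  # path of the written h5 file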