def write_main_dataset(h5_parent_group, main_data, main_data_name, quantity,
                       units, pos_dims, spec_dims, main_dset_attrs=None,
                       h5_pos_inds=None, h5_pos_vals=None, h5_spec_inds=None,
                       h5_spec_vals=None, aux_spec_prefix='Spectroscopic_',
                       aux_pos_prefix='Position_', verbose=False, **kwargs):
    """
    Writes the provided data as a 'Main' dataset with all appropriate linking.

    By default, the instructions for generating the ancillary datasets should
    be specified using the pos_dims and spec_dims arguments as dictionary
    objects. Alternatively, if both the indices and values datasets are already
    available for either/or the positions / spectroscopic, they can be
    specified using the keyword arguments. In this case, fresh datasets will
    not be generated.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group`
        Parent group under which the datasets will be created
    main_data : numpy.ndarray, dask.array.core.Array, list or tuple
        2D matrix formatted as [position, spectral] or a list / tuple with the
        shape for an empty dataset. If creating an empty dataset - the dtype
        must be specified via a kwarg.
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-'
        character.
    quantity : String / Unicode
        Name of the physical quantity stored in the dataset.
        Example - 'Current'
    units : String / Unicode
        Name of units for the quantity stored in the dataset.
        Example - 'A' for amperes
    pos_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions
        for constructing the indices and values datasets.
        Object specifying the instructions necessary for building the Position
        indices and values datasets
    spec_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions
        for constructing the indices and values datasets.
        Object specifying the instructions necessary for building the
        Spectroscopic indices and values datasets
    main_dset_attrs : dictionary, Optional
        Dictionary of parameters that will be written to the main dataset.
        Do NOT include region references here.
    h5_pos_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Indices"
    h5_pos_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Values"
    h5_spec_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Indices"
    h5_spec_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Values"
    aux_spec_prefix : str or unicode, Optional
        Default prefix for Spectroscopic datasets. Default = "Spectroscopic"
    aux_pos_prefix : str or unicode, Optional
        Default prefix for Position datasets. Default = "Position"
    verbose : bool, Optional, default=False
        If set to true - prints debugging logs
    kwargs
        Will be passed onto the creation of the dataset. Please pass chunking,
        compression, dtype, and other arguments this way

    Returns
    -------
    h5_main : USIDataset
        Reference to the main dataset
    """

    def __check_anc_before_creation(aux_prefix, dim_type='pos'):
        # Sanitize the ancillary-dataset prefix (enforce trailing '_',
        # replace '-') and make sure the target "<prefix>Indices" /
        # "<prefix>Values" names are not already taken in the parent group.
        aux_prefix = validate_single_string_arg(aux_prefix,
                                                'aux_' + dim_type + '_prefix')
        if not aux_prefix.endswith('_'):
            aux_prefix += '_'
        if '-' in aux_prefix:
            warn('aux_' + dim_type + ' should not contain the "-" character. Reformatted name from:{} to '
                 '{}'.format(aux_prefix, aux_prefix.replace('-', '_')))
        aux_prefix = aux_prefix.replace('-', '_')
        for dset_name in [aux_prefix + 'Indices', aux_prefix + 'Values']:
            if dset_name in h5_parent_group.keys():
                raise KeyError('Dataset named: ' + dset_name + ' already exists in group: '
                               '{}'.format(h5_parent_group.name))
        return aux_prefix

    # --- Validate the destination group and the writability of its file ---
    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    # --- Validate and normalize the string arguments ---
    quantity, units, main_data_name = validate_string_args([quantity, units, main_data_name],
                                                           ['quantity', 'units', 'main_data_name'])
    if verbose:
        print('quantity, units, main_data_name all OK')

    quantity = quantity.strip()
    units = units.strip()
    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        # '-' is not allowed in dataset names; warn and substitute '_'
        warn('main_data_name should not contain the "-" character. Reformatted name from:{} to '
             '{}'.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    # --- Determine the shape of the main dataset ---
    if isinstance(main_data, (list, tuple)):
        # A list/tuple means "create an empty dataset of this 2D shape";
        # an explicit dtype kwarg is then mandatory.
        if not contains_integers(main_data, min_val=1):
            raise ValueError('main_data if specified as a shape should be a list / tuple of integers >= 1')
        if len(main_data) != 2:
            raise ValueError('main_data if specified as a shape should contain 2 numbers')
        if 'dtype' not in kwargs:
            raise ValueError('dtype must be included as a kwarg when creating an empty dataset')
        _ = validate_dtype(kwargs.get('dtype'))
        main_shape = main_data
        if verbose:
            print('Selected empty dataset creation. OK so far')
    elif isinstance(main_data, (np.ndarray, da.core.Array)):
        if main_data.ndim != 2:
            raise ValueError('main_data should be a 2D array')
        main_shape = main_data.shape
        if verbose:
            print('Provided numpy or Dask array for main_data OK so far')
    else:
        raise TypeError('main_data should either be a numpy array or a tuple / list with the shape of the data')

    # --- Position ancillary datasets: reuse the provided pair or build fresh ---
    if h5_pos_inds is not None and h5_pos_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_pos_inds, h5_pos_vals, main_shape,
                              is_spectroscopic=False)
        if verbose:
            print('Provided h5 position indices and values OK')
    else:
        aux_pos_prefix = __check_anc_before_creation(aux_pos_prefix, dim_type='pos')
        pos_dims = validate_dimensions(pos_dims, dim_type='Position')
        validate_dims_against_main(main_shape, pos_dims, is_spectroscopic=False)
        if verbose:
            print('Passed all pre-tests for creating position datasets')
        h5_pos_inds, h5_pos_vals = write_ind_val_dsets(h5_parent_group, pos_dims, is_spectral=False, verbose=verbose,
                                                       base_name=aux_pos_prefix)
        if verbose:
            print('Created position datasets!')

    # --- Spectroscopic ancillary datasets: same logic as positions ---
    if h5_spec_inds is not None and h5_spec_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_spec_inds, h5_spec_vals, main_shape,
                              is_spectroscopic=True)
        if verbose:
            print('Provided h5 spectroscopic datasets were OK')
    else:
        aux_spec_prefix = __check_anc_before_creation(aux_spec_prefix, dim_type='spec')
        spec_dims = validate_dimensions(spec_dims, dim_type='Spectroscopic')
        validate_dims_against_main(main_shape, spec_dims, is_spectroscopic=True)
        if verbose:
            print('Passed all pre-tests for creating spectroscopic datasets')
        h5_spec_inds, h5_spec_vals = write_ind_val_dsets(h5_parent_group, spec_dims, is_spectral=True, verbose=verbose,
                                                         base_name=aux_spec_prefix)
        if verbose:
            print('Created Spectroscopic datasets')

    # Parallel (MPI) HDF5 cannot create compressed datasets; silently writing
    # would fail, so strip the kwarg and warn instead.
    if h5_parent_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn('This HDF5 file has been opened wth the "mpio" communicator. '
                 'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed')

    if isinstance(main_data, np.ndarray):
        # Case 1 - simple small dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, data=main_data, **kwargs)
        if verbose:
            print('Created main dataset with provided data')
    elif isinstance(main_data, da.core.Array):
        # Case 2 - Dask dataset
        # step 0 - get rid of any automated dtype specification:
        _ = kwargs.pop('dtype', None)
        # step 1 - create the empty dataset:
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 shape=main_data.shape,
                                                 dtype=main_data.dtype,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset: {} for writing Dask dataset: {}'.format(h5_main, main_data))
            print('Dask array will be written to HDF5 dataset: "{}" in file: "{}"'.format(h5_main.name,
                                                                                          h5_main.file.filename))
        # Step 2 - now ask Dask to dump data to disk
        da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
        # main_data.to_hdf5(h5_main.file.filename, h5_main.name)  # Does not work with python 2 for some reason
    else:
        # Case 3 - large empty dataset (main_data is a shape tuple here)
        h5_main = h5_parent_group.create_dataset(main_data_name, main_data, **kwargs)
        if verbose:
            print('Created empty dataset for Main')

    # Mandatory attributes that make this a USID "Main" dataset
    write_simple_attrs(h5_main, {'quantity': quantity, 'units': units})
    if verbose:
        print('Wrote quantity and units attributes to main dataset')

    # Optional user-supplied attributes (plain key/value pairs only)
    if isinstance(main_dset_attrs, dict):
        write_simple_attrs(h5_main, main_dset_attrs)
        if verbose:
            print('Wrote provided attributes to main dataset')

    write_book_keeping_attrs(h5_main)

    # make it main: attach references to the four ancillary datasets
    link_as_main(h5_main, h5_pos_inds, h5_pos_vals, h5_spec_inds, h5_spec_vals)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    # Local import to avoid a circular dependency with the usi_data module
    from ..usi_data import USIDataset
    return USIDataset(h5_main)
def translate(self, h5_path, data_name, raw_data, quantity, units, pos_dims,
              spec_dims, translator_name='ArrayTranslator', parm_dict=None,
              extra_dsets=None, **kwargs):
    """
    Writes the provided datasets and parameters to an h5 file

    Parameters
    ----------
    h5_path : str
        Absolute path of the h5 file to be written
    data_name : str
        Name of the scientific data type. Example - 'SEM'
    raw_data : :class:`np.ndarray` or :class:`dask.array.core.Array`
        2D matrix formatted as [position, spectral]
    quantity : str
        Name of the physical quantity stored in the dataset.
        Example - 'Current'
    units : str
        Name of units for the quantity stored in the dataset.
        Example - 'A' for amperes
    pos_dims : :class:`~pyUSID.io.write_utils.Dimension` or array-like of
        :class:`~pyUSID.io.write_utils.Dimension` objects
        Sequence of :class:`~pyUSID.io.write_utils.Dimension` objects that
        provides all necessary instructions for constructing the indices and
        values datasets.
        Object specifying the instructions necessary for building the Position
        indices and values datasets
    spec_dims : :class:`~pyUSID.io.write_utils.Dimension` or array-like of
        :class:`~pyUSID.io.write_utils.Dimension` objects
        Sequence of :class:`~pyUSID.io.write_utils.Dimension` objects that
        provides all necessary instructions for constructing the indices and
        values datasets.
        Object specifying the instructions necessary for building the
        Spectroscopic indices and values datasets
    translator_name : str, Optional
        Name of the translator. Example - 'HitachiSEMTranslator'
    parm_dict : dict, Optional
        Dictionary of parameters that will be written under the group
        'Measurement_000'
    extra_dsets : dict, Optional
        Dictionary whose values will be written into individual HDF5 datasets
        and whose corresponding keys provide the names of the datasets. You
        are recommended to limit these to simple and small datasets.
    kwargs : dict, Optional.
        Additional keyword arguments that will be passed onto
        :meth:`pyUSID.hdf_utils.write_main_dset()` which will in turn will be
        passed onto the creation of the dataset. Please pass chunking,
        compression, dtype, and other arguments this way

    Returns
    -------
    h5_path : str
        Absolute path of the written h5 file

    Raises
    ------
    TypeError
        If extra_dsets is not a dict or one of its values is not array-like
    KeyError
        If an extra_dsets key collides with a reserved dataset name
    """
    h5_path, data_name, translator_name, quantity, units = validate_string_args(
        [h5_path, data_name, translator_name, quantity, units],
        ['h5_path', 'data_name', 'translator_name', 'quantity', 'units'])
    validate_main_dset(raw_data, False)

    # Validate both sets of dimension descriptors against the 2D data shape
    for dimensions, dim_name in zip([pos_dims, spec_dims],
                                    ['Position', 'Spectroscopic']):
        dimensions = validate_dimensions(dimensions, dim_type=dim_name)
        validate_dims_against_main(raw_data.shape, dimensions,
                                   dim_name == 'Spectroscopic')

    if extra_dsets is not None:
        if not isinstance(extra_dsets, dict):
            raise TypeError('extra_dsets should be specified as dictionaries')
        for key, val in extra_dsets.items():
            [key] = validate_string_args(key, 'keys for extra_dsets')
            # NOTE(review): this is a substring test - a key that is a
            # substring of a reserved name (e.g. 'Raw') is also rejected.
            # Preserved as-is since callers may rely on the strictness.
            if np.any([key in x for x in ['Spectroscopic_Indices',
                                          'Spectroscopic_Values',
                                          'Position_Indices',
                                          'Position_Values', 'Raw_Data']]):
                raise KeyError('keys for extra_dsets cannot match reserved names for existing datasets')
            # Now check for data:
            if not isinstance(val, (list, tuple, np.ndarray, da.core.Array)):
                raise TypeError('values for extra_dsets should be a tuple, list, or numpy / dask array')
    else:
        extra_dsets = dict()

    # Start from a clean slate - any existing file at this path is replaced
    if path.exists(h5_path):
        remove(h5_path)

    if parm_dict is None:
        parm_dict = {}

    global_parms = generate_dummy_main_parms()
    global_parms['data_type'] = data_name
    global_parms['translator'] = translator_name

    # Begin writing to file:
    # BUGFIX: open with an explicit mode. The bare h5py.File(h5_path) relied
    # on the deprecated default mode ('a'); under h5py >= 3 the default is
    # 'r', which fails on the freshly-removed path. 'w' (create/truncate)
    # reproduces the original behavior since the file never exists here.
    with h5py.File(h5_path, mode='w') as h5_f:
        # Root attributes first:
        write_simple_attrs(h5_f, global_parms)
        write_book_keeping_attrs(h5_f)

        # measurement group next
        meas_grp = create_indexed_group(h5_f, 'Measurement')
        write_simple_attrs(meas_grp, parm_dict)

        # channel group next
        chan_grp = create_indexed_group(meas_grp, 'Channel')

        _ = write_main_dataset(chan_grp, raw_data, 'Raw_Data', quantity,
                               units, pos_dims, spec_dims, **kwargs)

        for key, val in extra_dsets.items():
            if isinstance(val, da.core.Array):
                # Dask writes lazily via its own HDF5 dumper
                da.to_hdf5(chan_grp.file.filename,
                           {chan_grp.name + '/' + key: val})
            else:
                chan_grp.create_dataset(key.strip(), data=val)

    return h5_path