def is_editable_h5(h5_obj):
    """
    Returns True if the file containing the provided h5 object is in w or r+
    modes.

    Deprecated wrapper: emits a warning asking callers to update their import
    and then delegates to ``hut.is_editable_h5``.

    Parameters
    ----------
    h5_obj : h5py.File, h5py.Group, or h5py.Dataset object
        h5py object

    Returns
    -------
    mode : bool
        True if the file containing the provided h5 object is in w or r+ modes
    """
    # Adjacent string literals are concatenated verbatim, so each fragment
    # must carry its own separating space.
    warn('pyUSID.io.hdf_utils.is_editable_h5 has been moved to '
         'sidpy.hdf.hdf_utils.is_editable_h5. This copy in pyUSID will '
         'be removed in future release. Please update your import statements')
    return hut.is_editable_h5(h5_obj)
def write_nsid_dataset(dataset, h5_group, main_data_name='', verbose=False,
                       **kwargs):
    """
    Writes the provided sid dataset as a 'Main' dataset with all appropriate
    linking.

    A new group named ``main_data_name`` is created under ``h5_group``; the
    main dataset, one dataset per dimension and one sub-group per dictionary
    attribute of ``dataset`` are written inside that new group.

    Parameters
    ----------
    dataset : sidpy.Dataset
        Dataset to be written to HDF5 in NSID format
    h5_group : :class:`h5py.Group` or :class:`h5py.File`
        Parent group under which the datasets will be created
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-'
        character ('-' is replaced by '_' with a warning). When empty, the
        last '/'-separated part of ``dataset.title`` is used, or 'nDim_Data'
        if the title is blank.
        Use this to provide better context about the dataset in the HDF5 file
    verbose : bool, Optional. Default = False
        Whether or not to write logs to standard out
    kwargs : dict
        additional keyword arguments passed on to h5py when writing data.
        Any 'dtype' entry is discarded (the dtype of ``dataset`` is used) and
        'compression' is discarded when the file uses the 'mpio' driver.

    Returns
    -------
    The object returned by ``link_as_main`` for the freshly written main
    dataset. The same object is also stored in ``dataset.h5_dataset``.

    Raises
    ------
    TypeError
        If ``dataset``, ``h5_group`` or ``main_data_name`` are of the wrong
        type
    ValueError
        If the file is not editable, if ``main_data_name`` already exists in
        ``h5_group``, or if an axis of ``dataset`` is not a Dimension
    """
    if not isinstance(dataset, Dataset):
        raise TypeError('data to write should be sidpy Dataset')
    if not isinstance(h5_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_parent_group should be a h5py.File or h5py.Group '
                        'object')
    if not isinstance(main_data_name, str):
        raise TypeError('main_data_name should be a string, but it instead it'
                        ' is {}'.format(type(main_data_name)))
    if not is_editable_h5(h5_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    # Fall back to the dataset title when no explicit name was given
    if main_data_name == '':
        if dataset.title.strip() == '':
            main_data_name = 'nDim_Data'
        else:
            main_data_name = dataset.title.split('/')[-1]

    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn('main_data_name should not contain the "-" character. Reformatted'
             ' name from:{} to '
             '{}'.format(main_data_name, main_data_name.replace('-', '_')))
        main_data_name = main_data_name.replace('-', '_')

    # This check must run against the PARENT group, i.e. before h5_group is
    # rebound to the newly created (and therefore empty) sub-group.
    if main_data_name in h5_group:
        raise ValueError('h5 dataset of that name already exists, choose '
                         'different name or delete first')

    h5_group = h5_group.create_group(main_data_name)
    write_book_keeping_attrs(h5_group)
    write_pynsid_book_keeping_attrs(h5_group)

    #####################
    # Write Main Dataset
    ####################
    if h5_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn('This HDF5 file has been opened wth the "mpio" communicator. '
                 'mpi4py does not allow creation of compressed datasets. '
                 'Compression kwarg has been removed')

    # The dtype always comes from the dataset itself
    kwargs.pop('dtype', None)

    # step 1 - create the empty dataset:
    h5_main = h5_group.create_dataset(main_data_name, shape=dataset.shape,
                                      dtype=dataset.dtype, **kwargs)
    if verbose:
        print('Created empty dataset: {} for writing Dask dataset: {}'
              ''.format(h5_main, dataset))
        print('Dask array will be written to HDF5 dataset: "{}" in file: "{}"'
              ''.format(h5_main.name, h5_main.file.filename))
    # Step 2 - now ask Dask to dump data to disk
    da.to_hdf5(h5_main.file.filename, {h5_main.name: dataset})

    if verbose:
        print('Created dataset for Main')

    #################
    # Add Dimensions
    #################
    dimensional_dict = {}

    for i, this_dim in dataset._axes.items():
        if not isinstance(this_dim, Dimension):
            raise ValueError(
                'Dimensions {} is not a sidpy Dimension'.format(i))

        this_dim_dset = h5_group.create_dataset(this_dim.name,
                                                data=this_dim.values)
        write_simple_attrs(this_dim_dset, {
            'name': this_dim.name,
            'units': this_dim.units,
            'quantity': this_dim.quantity,
            'dimension_type': this_dim.dimension_type.name,
        })
        dimensional_dict[i] = this_dim_dset

    attrs_to_write = {
        'quantity': dataset.quantity,
        'units': dataset.units,
        'main_data_name': dataset.title,
        'data_type': dataset.data_type.name,
        'modality': dataset.modality,
        'source': dataset.source,
    }
    write_simple_attrs(h5_main, attrs_to_write)
    write_pynsid_book_keeping_attrs(h5_main)

    # Every attribute of the dataset object that holds a dictionary is written
    # out under the attribute's own name
    for attr_name in dir(dataset):
        attr_val = getattr(dataset, attr_name)
        if isinstance(attr_val, dict):
            if verbose:
                print('Writing attributes from property: {} of the '
                      'sidpy.Dataset'.format(attr_name))
            write_dict_to_h5_group(h5_group, attr_val, attr_name)

    # This will attach the dimensions
    nsid_data_main = link_as_main(h5_main, dimensional_dict)

    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    dataset.h5_dataset = nsid_data_main

    return nsid_data_main
def write_ind_val_dsets(h5_parent_group, dimensions, is_spectral=True,
                        verbose=False, base_name=None, slow_to_fast=False):
    """
    Creates h5py.Datasets for the position OR spectroscopic indices and values
    of the data.
    Remember that the contents of the dataset can be changed if need be after
    the creation of the datasets.
    For example if one of the spectroscopic dimensions (e.g. - Bias) was
    sinusoidal and not linear, The specific dimension in the
    Spectroscopic_Values dataset can be manually overwritten.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or :class:`h5py.File`
        Group under which the indices and values datasets will be created
    dimensions : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions
        for constructing the indices and values datasets
    is_spectral : bool, optional. default = True
        Spectroscopic (True) or Position (False)
    verbose : Boolean, optional
        Whether or not to print statements for debugging purposes
    base_name : str or unicode, optional
        Prefix for the datasets. Default: 'Position' when is_spectral is False,
        'Spectroscopic' otherwise. A trailing '_' is appended when missing.
    slow_to_fast : bool, Optional. Default=False
        Set to True if the dimensions are arranged from slowest varying to
        fastest varying. Set to False otherwise.

    Returns
    -------
    h5_indices : h5py.Dataset
        Dataset named ``base_name + 'Indices'``
    h5_values : h5py.Dataset
        Dataset named ``base_name + 'Values'``

    Raises
    ------
    TypeError
        If ``dimensions`` or ``h5_parent_group`` are of the wrong type
    ValueError
        If the file is not editable, or if dimensions in the incomplete mode
        do not all have the same number of values
    KeyError
        If either of the two datasets already exists in ``h5_parent_group``
    NotImplementedError
        If the dimensions mix several modes, or use a mode other than
        ``DimType.DEFAULT`` / ``DimType.INCOMPLETE``

    Notes
    -----
    With ``slow_to_fast=False`` (the current default) the dimensions are
    expected in the order from fastest varying to slowest. A warning announces
    that this default is planned to change.
    """
    if isinstance(dimensions, Dimension):
        dimensions = [dimensions]
    if not isinstance(dimensions, (list, np.ndarray, tuple)):
        raise TypeError('dimensions should be array-like ')
    if not np.all([isinstance(x, Dimension) for x in dimensions]):
        raise TypeError('dimensions should be a sequence of Dimension objects')

    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_parent_group should be a h5py.File or Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided h5 object is not valid / open')

    if base_name is not None:
        base_name = validate_single_string_arg(base_name, 'base_name')
        if not base_name.endswith('_'):
            base_name += '_'
    else:
        base_name = 'Spectroscopic_' if is_spectral else 'Position_'

    if not slow_to_fast:
        warn('In the future write_ind_val_dsets will default to requiring dimensions to be arranged from slowest to fastest varying')

    # check if the datasets already exist. If they do, there's no point in going any further
    for sub_name in ['Indices', 'Values']:
        if base_name + sub_name in h5_parent_group.keys():
            raise KeyError('Dataset: {} already exists in provided group: {}'.format(base_name + sub_name,
                                                                                     h5_parent_group.name))

    modes = [dim.mode for dim in dimensions]
    sing_mode = np.unique(modes)

    if sing_mode.size > 1:
        raise NotImplementedError('Cannot yet work on combinations of modes for Dimensions. Consider doing manually')

    sing_mode = sing_mode[0]

    if sing_mode == DimType.DEFAULT:
        if slow_to_fast:
            # Ensure that the dimensions are arranged from fast to slow instead
            dimensions = dimensions[::-1]

        indices, values = build_ind_val_matrices([dim.values for dim in dimensions],
                                                 is_spectral=is_spectral)

        # At this point, dimensions and unit values are arranged from fastest to slowest
        # We want dimensions to be arranged from slowest to fastest:
        rev_func = np.flipud if is_spectral else np.fliplr
        dimensions = dimensions[::-1]
        indices = rev_func(indices)
        values = rev_func(values)

    elif sing_mode == DimType.INCOMPLETE:
        lengths = np.unique([len(dim.values) for dim in dimensions])
        if len(lengths) > 1:
            raise ValueError('Values for dimensions not of same length')
        single_dim = np.arange(lengths[0], dtype=INDICES_DTYPE)
        # One identical index column per dimension. This used to be hard-coded
        # to two columns, which only matched the values for exactly two
        # dimensions.
        indices = np.tile(single_dim, (len(dimensions), 1)).T
        values = np.dstack(tuple([dim.values for dim in dimensions])).squeeze()

        if is_spectral:
            indices = indices.T
            values = values.T
    else:
        raise NotImplementedError('Cannot yet work on Dependent dimensions')

    if verbose:
        print('Indices:')
        print(indices)
        print('Values:')
        print(values)

    # Create the Datasets for both Indices and Values
    h5_indices = h5_parent_group.create_dataset(base_name + 'Indices', data=INDICES_DTYPE(indices),
                                                dtype=INDICES_DTYPE)
    h5_values = h5_parent_group.create_dataset(base_name + 'Values', data=VALUES_DTYPE(values),
                                               dtype=VALUES_DTYPE)

    # Both datasets carry the same per-dimension metadata
    for h5_dset in [h5_indices, h5_values]:
        write_simple_attrs(h5_dset, {'units': [x.units for x in dimensions],
                                     'labels': [x.name for x in dimensions],
                                     'type': [dim.mode.value for dim in dimensions]})

    # Adjacent string literals are concatenated verbatim, so each fragment
    # must carry its own separating space.
    warn('pyUSID.io.hdf_utils.simple.write_ind_val_dsets no longer creates '
         'region references for each dimension. Please use '
         'pyUSID.io.reg_ref.write_region_references to manually create region '
         'references')

    return h5_indices, h5_values
def write_main_dataset(h5_parent_group, main_data, main_data_name, quantity,
                       units, data_type, modality, source, dim_dict,
                       main_dset_attrs=None, verbose=False, slow_to_fast=False,
                       **kwargs):
    """
    Writes the provided data as a 'Main' dataset with all appropriate linking.

    The instructions for generating the dimensions should be provided as a
    dictionary containing Dimension objects or existing h5py datasets.
    Dimensions given as h5py datasets are used as they are (they can be shared
    with other main datasets); fresh datasets are only created for Dimension
    objects.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or :class:`h5py.File`
        Parent group under which the datasets will be created
    main_data : numpy.ndarray, dask.array.core.Array, list or tuple
        Data to write, or a list / tuple with the shape for an empty dataset.
        If creating an empty dataset - the dtype must be specified via a kwarg.
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-'
        character ('-' is replaced by '_' with a warning).
    quantity : String / Unicode
        Name of the physical quantity stored in the dataset. Example - 'Current'
    units : String / Unicode
        Name of units for the quantity stored in the dataset. Example - 'A' for
        amperes
    data_type : String / Unicode
        What kind of data this is. Example - image, image stack, video,
        hyperspectral image, etc.
    modality : String / Unicode
        Experimental / simulation modality - scientific meaning of data.
        Example - photograph, TEM micrograph, SPM Force-Distance spectroscopy.
    source : String / Unicode
        Source for dataset like the kind of instrument.
    dim_dict : dict
        Maps each axis of the main data to a Dimension object or a h5py
        dataset. The keys must be exactly the integers 0 .. N-1 for an
        N-dimensional dataset.
        E.g. {0: position_X, 1: position_Y, 2: spectra}
    main_dset_attrs : dictionary, Optional, default = None
        flat dictionary of data to be added to the dataset
    verbose : bool, Optional, default=False
        If set to true - prints debugging logs
    slow_to_fast : bool, Optional. Default=False
        Not used by this function.
    kwargs
        will be passed onto the creation of the dataset. Please pass chunking,
        compression, dtype, and other arguments this way

    Returns
    -------
    The object returned by ``link_as_main`` for the written main dataset, or
    None (after printing a message) when ``validate_main_dimensions`` reports
    a problem or when ``main_data_name`` already exists in ``h5_parent_group``.

    Raises
    ------
    TypeError
        If ``h5_parent_group``, ``main_data`` or an entry of ``dim_dict`` are
        of the wrong type
    ValueError
        If the file is not editable, the shape is invalid, the dtype is
        missing for an empty dataset, or the number of dimensions is wrong
    KeyError
        If the keys of ``dim_dict`` are not exactly 0 .. N-1
    """
    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    #####################
    # Validate Main Data
    #####################
    quantity, units, main_data_name, data_type, modality, source = \
        validate_string_args(
            [quantity, units, main_data_name, data_type, modality, source],
            ['quantity', 'units', 'main_data_name', 'data_type', 'modality',
             'source'])

    if verbose:
        print('quantity, units, main_data_name all OK')

    quantity = quantity.strip()
    units = units.strip()
    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn(
            'main_data_name should not contain the "-" character. Reformatted name from:{} to '
            '{}'.format(main_data_name, main_data_name.replace('-', '_')))
        main_data_name = main_data_name.replace('-', '_')

    if isinstance(main_data, (list, tuple)):
        # main_data is only a shape: an empty dataset will be created
        if not contains_integers(main_data, min_val=1):
            raise ValueError(
                'main_data if specified as a shape should be a list / tuple of integers >= 1'
            )
        if len(main_data) < 1:
            raise ValueError(
                'main_data if specified as a shape should contain at least 1 number for the singular dimension'
            )
        if 'dtype' not in kwargs:
            raise ValueError(
                'dtype must be included as a kwarg when creating an empty dataset'
            )
        validate_dtype(kwargs.get('dtype'))
        main_shape = main_data
        if verbose:
            print('Selected empty dataset creation. OK so far')
    elif isinstance(main_data, (np.ndarray, da.core.Array)):
        main_shape = main_data.shape
        if verbose:
            print('Provided numpy or Dask array for main_data OK so far')
    else:
        raise TypeError(
            'main_data should either be a numpy array or a tuple / list with the shape of the data'
        )

    ######################
    # Validate Dimensions
    ######################
    # An N dimensional dataset should have N items in the dimension dictionary
    if len(dim_dict) != len(main_shape):
        raise ValueError(
            'Incorrect number of dimensions: {} provided to support main data, of shape: {}'
            .format(len(dim_dict), main_shape))
    if set(range(len(main_shape))) != set(dim_dict.keys()):
        raise KeyError(
            'dim_dict should have exactly the integer keys: {} but has the '
            'keys: {}'.format(list(range(len(main_shape))),
                              list(dim_dict.keys())))

    if False in validate_main_dimensions(main_shape, dim_dict,
                                         h5_parent_group):
        # Kept as a soft failure (message + None) for backward compatibility
        print('Dimensions incorrect')
        return
    if verbose:
        print('Dimensions are correct!')

    #####################
    # Write Main Dataset
    ####################
    if h5_parent_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn(
                'This HDF5 file has been opened wth the "mpio" communicator. '
                'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed'
            )

    if main_data_name in h5_parent_group:
        # Kept as a soft failure (message + None) for backward compatibility
        print('Oops, dataset exits')
        return

    if isinstance(main_data, np.ndarray):
        # Case 1 - simple small dataset
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 data=main_data, **kwargs)
        if verbose:
            print('Created main dataset with provided data')
    elif isinstance(main_data, da.core.Array):
        # Case 2 - Dask dataset
        # step 0 - get rid of any automated dtype specification:
        kwargs.pop('dtype', None)
        # step 1 - create the empty dataset:
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 shape=main_data.shape,
                                                 dtype=main_data.dtype,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset: {} for writing Dask dataset: {}'.
                  format(h5_main, main_data))
            print(
                'Dask array will be written to HDF5 dataset: "{}" in file: "{}"'
                .format(h5_main.name, h5_main.file.filename))
        # Step 2 - now ask Dask to dump data to disk
        da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
    else:
        # Case 3 - large empty dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, main_data,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset for Main')

    #################
    # Add Dimensions
    #################
    dimensional_dict = {}

    for i, this_dim in dim_dict.items():
        if isinstance(this_dim, h5py.Dataset):
            # Existing dataset: reuse it, only make sure it is version-stamped
            this_dim_dset = this_dim
            if 'nsid_version' not in this_dim_dset.attrs:
                this_dim_dset.attrs['nsid_version'] = '0.0.1'
        elif isinstance(this_dim, Dimension):
            this_dim_dset = h5_parent_group.create_dataset(
                this_dim.name, data=this_dim.values)
            write_simple_attrs(this_dim_dset, {
                'name': this_dim.name,
                'units': this_dim.units,
                'quantity': this_dim.quantity,
                'dimension_type': this_dim.dimension_type,
                'nsid_version': '0.0.1',
            })
        else:
            # Falling through here would either fail with a NameError or
            # silently link the dataset of the previous dimension.
            raise TypeError(
                'Dimension {} should be a Dimension or h5py.Dataset object '
                'but is of type: {}'.format(i, type(this_dim)))

        dimensional_dict[i] = this_dim_dset

    attrs_to_write = {
        'quantity': quantity,
        'units': units,
        'nsid_version': '0.0.1',
        'main_data_name': main_data_name,
        'data_type': data_type,
        'modality': modality,
        'source': source,
    }

    write_simple_attrs(h5_main, attrs_to_write)

    if verbose:
        print('Wrote dimensions and attributes to main dataset')

    if isinstance(main_dset_attrs, dict):
        write_simple_attrs(h5_main, main_dset_attrs)
        if verbose:
            print('Wrote provided attributes to main dataset')

    # ToDo: check if we need write_book_keeping_attrs(h5_main)
    NSID_data_main = link_as_main(h5_main, dimensional_dict)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    return NSID_data_main
def write_main_dataset(h5_parent_group, main_data, main_data_name, quantity,
                       units, pos_dims, spec_dims, main_dset_attrs=None,
                       h5_pos_inds=None, h5_pos_vals=None, h5_spec_inds=None,
                       h5_spec_vals=None, aux_spec_prefix='Spectroscopic_',
                       aux_pos_prefix='Position_', verbose=False,
                       slow_to_fast=False, **kwargs):
    """
    Writes the provided data as a 'Main' dataset with all appropriate linking.
    By default, the instructions for generating the ancillary datasets should
    be specified using the pos_dims and spec_dims arguments as Dimension
    objects.
    Alternatively, if both the indices and values datasets are already
    available for either/or the positions / spectroscopic, they can be
    specified using the keyword arguments. In this case, fresh datasets will
    not be generated.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or :class:`h5py.File`
        Parent group under which the datasets will be created
    main_data : numpy.ndarray, dask.array.core.Array, list or tuple
        2D matrix formatted as [position, spectral] or a list / tuple with the
        shape for an empty dataset. If creating an empty dataset - the dtype
        must be specified via a kwarg.
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-'
        character ('-' is replaced by '_' with a warning).
    quantity : String / Unicode
        Name of the physical quantity stored in the dataset. Example - 'Current'
    units : String / Unicode
        Name of units for the quantity stored in the dataset. Example - 'A' for
        amperes
    pos_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions
        for constructing the Position indices and values datasets. Only used
        when h5_pos_inds and h5_pos_vals are not both provided.
    spec_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions
        for constructing the Spectroscopic indices and values datasets. Only
        used when h5_spec_inds and h5_spec_vals are not both provided.
    main_dset_attrs : dictionary, Optional
        Dictionary of parameters that will be written to the main dataset.
        Do NOT include region references here.
    h5_pos_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Indices"
    h5_pos_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Values"
    h5_spec_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Indices"
    h5_spec_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Values"
    aux_spec_prefix : str or unicode, Optional
        Default prefix for Spectroscopic datasets. Default = "Spectroscopic_"
    aux_pos_prefix : str or unicode, Optional
        Default prefix for Position datasets. Default = "Position_"
    verbose : bool, Optional, default=False
        If set to true - prints debugging logs
    slow_to_fast : bool, Optional. Default=False
        Set to True if the dimensions are arranged from slowest varying to
        fastest varying. Set to False otherwise. Passed on to
        write_ind_val_dsets.
    kwargs
        will be passed onto the creation of the dataset. Please pass chunking,
        compression, dtype, and other arguments this way

    Returns
    -------
    h5_main : USIDataset
        Reference to the main dataset

    Raises
    ------
    TypeError
        If ``h5_parent_group`` or ``main_data`` are of the wrong type
    ValueError
        If the file is not editable, if ``main_data`` is not two dimensional,
        if the dtype is missing for an empty dataset, or if provided ancillary
        datasets live in different HDF5 files
    KeyError
        If ancillary datasets that need to be created already exist in
        ``h5_parent_group``
    """

    def _check_anc_before_creation(aux_prefix, dim_type='pos'):
        # Sanitizes the prefix and verifies that the ancillary datasets that
        # are about to be created do not exist yet.
        aux_prefix = validate_single_string_arg(aux_prefix,
                                                'aux_' + dim_type + '_prefix')
        if not aux_prefix.endswith('_'):
            aux_prefix += '_'
        if '-' in aux_prefix:
            warn(
                'aux_' + dim_type +
                '_prefix should not contain the "-" character. Reformatted name from:{} to '
                '{}'.format(aux_prefix, aux_prefix.replace('-', '_')))
            aux_prefix = aux_prefix.replace('-', '_')
        for dset_name in [aux_prefix + 'Indices', aux_prefix + 'Values']:
            if dset_name in h5_parent_group.keys():
                # TODO: What if the contained data was correct?
                raise KeyError(
                    'Dataset named: ' + dset_name +
                    ' already exists in group: '
                    '{}. Consider passing these datasets using kwargs (if they are correct) instead of providing the pos_dims and spec_dims arguments'
                    .format(h5_parent_group.name))
        return aux_prefix

    def _ensure_anc_in_correct_file(h5_inds, h5_vals, prefix):
        # Returns the pair of ancillary datasets, copied into the destination
        # group first when they live in a different HDF5 file.
        if h5_inds.file != h5_vals.file:
            raise ValueError('Provided ' + prefix +
                             ' datasets are present in different HDF5 files!')

        if h5_inds.file != h5_parent_group.file:
            # Need to copy over the anc datasets to the new group
            if verbose:
                print('Need to copy over ancillary datasets: {} and {} to '
                      'destination group: {} which is in a different HDF5 '
                      'file'.format(h5_inds, h5_vals, h5_parent_group))
            ret_vals = [
                copy_dataset(x, h5_parent_group, verbose=verbose)
                for x in [h5_inds, h5_vals]
            ]
        else:
            ret_vals = [h5_inds, h5_vals]
        return tuple(ret_vals)

    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    quantity, units, main_data_name = validate_string_args(
        [quantity, units, main_data_name],
        ['quantity', 'units', 'main_data_name'])
    if verbose:
        print('quantity, units, main_data_name all OK')

    quantity = quantity.strip()
    units = units.strip()
    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn(
            'main_data_name should not contain the "-" character. Reformatted name from:{} to '
            '{}'.format(main_data_name, main_data_name.replace('-', '_')))
        main_data_name = main_data_name.replace('-', '_')

    if isinstance(main_data, (list, tuple)):
        # main_data is only a shape: an empty dataset will be created
        if not contains_integers(main_data, min_val=1):
            raise ValueError(
                'main_data if specified as a shape should be a list / tuple of integers >= 1'
            )
        if len(main_data) != 2:
            raise ValueError(
                'main_data if specified as a shape should contain 2 numbers')
        if 'dtype' not in kwargs:
            raise ValueError(
                'dtype must be included as a kwarg when creating an empty dataset'
            )
        validate_dtype(kwargs.get('dtype'))
        main_shape = main_data
        if verbose:
            print('Selected empty dataset creation. OK so far')
    elif isinstance(main_data, (np.ndarray, da.core.Array)):
        if main_data.ndim != 2:
            raise ValueError('main_data should be a 2D array')
        main_shape = main_data.shape
        if verbose:
            print('Provided numpy or Dask array for main_data OK so far')
    else:
        raise TypeError(
            'main_data should either be a numpy array or a tuple / list with the shape of the data'
        )

    # ------------------ Position ancillary datasets ------------------
    if h5_pos_inds is not None and h5_pos_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_pos_inds, h5_pos_vals, main_shape,
                              is_spectroscopic=False)
        if verbose:
            print(
                'The shapes of the provided h5 position indices and values are OK'
            )
        h5_pos_inds, h5_pos_vals = _ensure_anc_in_correct_file(
            h5_pos_inds, h5_pos_vals, 'Position')
    else:
        aux_pos_prefix = _check_anc_before_creation(aux_pos_prefix,
                                                    dim_type='pos')
        pos_dims = validate_dimensions(pos_dims, dim_type='Position')
        validate_dims_against_main(main_shape, pos_dims,
                                   is_spectroscopic=False)
        if verbose:
            print('Passed all pre-tests for creating position datasets')
        h5_pos_inds, h5_pos_vals = write_ind_val_dsets(
            h5_parent_group, pos_dims, is_spectral=False, verbose=verbose,
            slow_to_fast=slow_to_fast, base_name=aux_pos_prefix)
        if verbose:
            print('Created position datasets!')

    # ---------------- Spectroscopic ancillary datasets ----------------
    if h5_spec_inds is not None and h5_spec_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_spec_inds, h5_spec_vals, main_shape,
                              is_spectroscopic=True)
        if verbose:
            print('The shapes of the provided h5 spectroscopic indices and '
                  'values are OK')
        h5_spec_inds, h5_spec_vals = _ensure_anc_in_correct_file(
            h5_spec_inds, h5_spec_vals, 'Spectroscopic')
    else:
        aux_spec_prefix = _check_anc_before_creation(aux_spec_prefix,
                                                     dim_type='spec')
        spec_dims = validate_dimensions(spec_dims, dim_type='Spectroscopic')
        validate_dims_against_main(main_shape, spec_dims,
                                   is_spectroscopic=True)
        if verbose:
            print('Passed all pre-tests for creating spectroscopic datasets')
        h5_spec_inds, h5_spec_vals = write_ind_val_dsets(
            h5_parent_group, spec_dims, is_spectral=True, verbose=verbose,
            slow_to_fast=slow_to_fast, base_name=aux_spec_prefix)
        if verbose:
            print('Created Spectroscopic datasets')

    # ------------------------- Main dataset -------------------------
    if h5_parent_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn(
                'This HDF5 file has been opened wth the "mpio" communicator. '
                'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed'
            )

    if isinstance(main_data, np.ndarray):
        # Case 1 - simple small dataset
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 data=main_data, **kwargs)
        if verbose:
            print('Created main dataset with provided data')
    elif isinstance(main_data, da.core.Array):
        # Case 2 - Dask dataset
        # step 0 - get rid of any automated dtype specification:
        kwargs.pop('dtype', None)
        # step 1 - create the empty dataset:
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 shape=main_data.shape,
                                                 dtype=main_data.dtype,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset: {} for writing Dask dataset: {}'.
                  format(h5_main, main_data))
            print(
                'Dask array will be written to HDF5 dataset: "{}" in file: "{}"'
                .format(h5_main.name, h5_main.file.filename))
        # Step 2 - now ask Dask to dump data to disk
        da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
    else:
        # Case 3 - large empty dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, main_data,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset for Main')

    write_simple_attrs(h5_main, {'quantity': quantity, 'units': units})
    if verbose:
        print('Wrote quantity and units attributes to main dataset')

    if isinstance(main_dset_attrs, dict):
        write_simple_attrs(h5_main, main_dset_attrs)
        if verbose:
            print('Wrote provided attributes to main dataset')

    write_book_keeping_attrs(h5_main)

    # make it main
    link_as_main(h5_main, h5_pos_inds, h5_pos_vals, h5_spec_inds,
                 h5_spec_vals)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    from ..usi_data import USIDataset
    return USIDataset(h5_main)
def write_nsid_dataset(dataset, h5_group, main_data_name='', verbose=False,
                       **kwargs):
    """
    Writes the provided sid dataset as a 'Main' dataset with all appropriate
    linking.

    Besides the main dataset and one dataset per dimension, an
    'original_metadata' group is created in ``h5_group``, as well as
    'aberrations' and 'annotations' groups when ``dataset`` has attributes of
    these names.

    Parameters
    ----------
    dataset : sidpy Dataset
        Dataset to be written
    h5_group : :class:`h5py.Group` or :class:`h5py.File`
        Parent group under which the datasets will be created
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-'
        character ('-' is replaced by '_' with a warning). When empty, the
        last '/'-separated part of ``dataset.title`` is used, or 'nDim_Data'
        if the title is blank.
    verbose : bool, Optional. Default = False
        If set to true - prints debugging logs
    kwargs
        additional h5py parameters for the creation of the main dataset. Any
        'dtype' entry is discarded (the dtype of ``dataset`` is used) and
        'compression' is discarded when the file uses the 'mpio' driver.

    Returns
    -------
    The object returned by ``link_as_main`` for the written main dataset

    Raises
    ------
    ValueError
        If ``dataset`` is not a sidpy Dataset, if the file is not editable, if
        ``main_data_name`` already exists in ``h5_group``, or if an axis of
        ``dataset`` is not a Dimension
    TypeError
        If ``h5_group`` is not a h5py.File or h5py.Group
    """
    if not isinstance(dataset, Dataset):
        # ValueError (rather than TypeError) is kept for backward compatibility
        raise ValueError('data to write should be sidpy Dataset')
    if not isinstance(h5_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    # Fall back to the dataset title when no explicit name was given
    if main_data_name == '':
        if dataset.title.strip() == '':
            main_data_name = 'nDim_Data'
        else:
            main_data_name = dataset.title.split('/')[-1]

    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn(
            'main_data_name should not contain the "-" character. Reformatted name from:{} to '
            '{}'.format(main_data_name, main_data_name.replace('-', '_')))
        main_data_name = main_data_name.replace('-', '_')

    #####################
    # Write Main Dataset
    ####################
    if h5_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn(
                'This HDF5 file has been opened wth the "mpio" communicator. '
                'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed'
            )

    # Debugging output only - never print unconditionally
    if verbose:
        print(h5_group, main_data_name)

    if main_data_name in h5_group:
        raise ValueError(
            'h5 dataset of that name already exists, choose different name or delete first'
        )

    # The dtype always comes from the dataset itself
    kwargs.pop('dtype', None)

    # step 1 - create the empty dataset:
    h5_main = h5_group.create_dataset(main_data_name, shape=dataset.shape,
                                      dtype=dataset.dtype, **kwargs)
    if verbose:
        print('Created empty dataset: {} for writing Dask dataset: {}'.format(
            h5_main, dataset))
        print('Dask array will be written to HDF5 dataset: "{}" in file: "{}"'.
              format(h5_main.name, h5_main.file.filename))
    # Step 2 - now ask Dask to dump data to disk
    da.to_hdf5(h5_main.file.filename, {h5_main.name: dataset})

    if verbose:
        print('Created dataset for Main')

    #################
    # Add Dimensions
    #################
    dimensional_dict = {}

    for i, this_dim in dataset.axes.items():
        if not isinstance(this_dim, Dimension):
            raise ValueError(
                'Dimensions {} is not a sidpy Dimension'.format(i))

        this_dim_dset = h5_group.create_dataset(this_dim.name,
                                                data=this_dim.values)
        write_simple_attrs(this_dim_dset, {
            'name': this_dim.name,
            'units': this_dim.units,
            'quantity': this_dim.quantity,
            'dimension_type': this_dim.dimension_type,
            'nsid_version': version,
        })
        dimensional_dict[i] = this_dim_dset

    attrs_to_write = {
        'quantity': dataset.quantity,
        'units': dataset.units,
        'nsid_version': version,
        'main_data_name': dataset.title,
        'data_type': dataset.data_type,
        'modality': dataset.modality,
        'source': dataset.source,
    }
    write_simple_attrs(h5_main, attrs_to_write)

    # Remaining attributes of the dataset; the standard ones written above
    # are not overwritten
    for key, item in dataset.attrs.items():
        if key not in attrs_to_write:
            # TODO: Check item to be simple
            h5_main.attrs[key] = item

    original_group = h5_group.create_group('original_metadata')
    for key, item in dataset.original_metadata.items():
        original_group.attrs[key] = item

    if hasattr(dataset, 'aberrations'):
        aberrations_group = h5_group.create_group('aberrations')
        for key, item in dataset.aberrations.items():
            aberrations_group.attrs[key] = item

    if hasattr(dataset, 'annotations'):
        annotations_group = h5_group.create_group('annotations')
        for key, item in dataset.annotations.items():
            annotations_group.attrs[key] = item

    # ToDo: check if we need write_book_keeping_attrs(h5_main)
    # This will attach the dimensions
    nsid_data_main = link_as_main(h5_main, dimensional_dict)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    return nsid_data_main