def write_book_keeping_attrs(h5_obj):
    """
    Writes basic book-keeping and posterity related attributes to groups
    created in pyNSID such as machine id, pyNSID version, timestamp.

    Parameters
    ----------
    h5_obj : :class:`h5py.Dataset`, :class:`h5py.Group`, or :class:`h5py.File`
        Object to which basic book-keeping attributes need to be written
    """
    hut.write_book_keeping_attrs(h5_obj)
    hut.write_simple_attrs(h5_obj, {'pyNSID_version': py_nsid_version})
def write_pynsid_book_keeping_attrs(h5_object):
    """
    Writes book-keeping information to the HDF5 object

    Parameters
    ----------
    h5_object : :class:`h5py.Dataset`, :class:`h5py.Group`, or :class:`h5py.File`
        Object to which the book-keeping attributes will be written
    """
    write_book_keeping_attrs(h5_object)
    write_simple_attrs(h5_object, {'pyNSID_version': pynsid_version})
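# A minimal sketch (the function name `_example_inspect_book_keeping` and the
# `h5_group` argument are hypothetical) of inspecting the attributes written
# by the helper above. The exact attribute names come from sidpy's hdf_utils
# and, per the docstring, include machine id, timestamp and version strings.
def _example_inspect_book_keeping(h5_group):
    """Print every attribute written by write_pynsid_book_keeping_attrs"""
    write_pynsid_book_keeping_attrs(h5_group)
    for key, val in h5_group.attrs.items():
        print('{}: {}'.format(key, val))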
def create_indexed_group(h5_parent_group, base_name):
    """
    Creates a group with an indexed name (eg - 'Measurement_012') under
    ``h5_parent_group`` using the provided ``base_name`` as a prefix for the
    group's name

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or :class:`h5py.File`
        File or group within which the new group will be created
    base_name : str or unicode
        Prefix for the group name. This need not end with a '_'. It will be
        added automatically

    Returns
    -------
    h5_new_group : :class:`h5py.Group`
        Newly created group with an indexed name
    """
    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or Group object')
    base_name = validate_single_string_arg(base_name, 'base_name')

    group_name = assign_group_index(h5_parent_group, base_name)
    h5_new_group = h5_parent_group.create_group(group_name)
    write_book_keeping_attrs(h5_new_group)
    return h5_new_group
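# Usage sketch for create_indexed_group. The function name
# `_example_create_indexed_group`, the file name 'example.h5' and the
# prefix 'Measurement' are all hypothetical; successive calls with the same
# prefix yield auto-indexed names such as Measurement_000, Measurement_001.
def _example_create_indexed_group():
    import h5py
    with h5py.File('example.h5', mode='a') as h5_file:
        h5_grp_0 = create_indexed_group(h5_file, 'Measurement')
        h5_grp_1 = create_indexed_group(h5_file, 'Measurement')
        # e.g. /Measurement_000 and /Measurement_001
        print(h5_grp_0.name, h5_grp_1.name)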
def write_nsid_dataset(dataset, h5_group, main_data_name='', verbose=False,
                       **kwargs):
    """
    Writes the provided sidpy dataset as a 'Main' dataset with all
    appropriate linking.

    Parameters
    ----------
    dataset : sidpy.Dataset
        Dataset to be written to HDF5 in NSID format
    h5_group : :class:`h5py.Group`
        Parent group under which the datasets will be created
    main_data_name : str / unicode
        Name to give to the main dataset. This cannot contain the '-'
        character. Use this to provide better context about the dataset in
        the HDF5 file
    verbose : bool, Optional. Default = False
        Whether or not to write logs to standard out
    kwargs : dict
        Additional keyword arguments passed on to h5py when writing data

    Returns
    -------
    h5_main : h5py.Dataset
        HDF5 dataset that has been linked as the 'Main' NSID dataset
    """
    if not isinstance(dataset, Dataset):
        raise TypeError('data to write should be a sidpy Dataset')
    if not isinstance(h5_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_group should be a h5py.File or h5py.Group '
                        'object')
    if not isinstance(main_data_name, str):
        raise TypeError('main_data_name should be a string, but it is '
                        'instead a {}'.format(type(main_data_name)))
    if not is_editable_h5(h5_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    if main_data_name == '':
        if dataset.title.strip() == '':
            main_data_name = 'nDim_Data'
        else:
            main_data_name = dataset.title.split('/')[-1]

    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn('main_data_name should not contain the "-" character. '
             'Reformatted name from:{} to '
             '{}'.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    h5_group = h5_group.create_group(main_data_name)

    write_book_keeping_attrs(h5_group)
    write_pynsid_book_keeping_attrs(h5_group)

    #####################
    # Write Main Dataset
    #####################
    if h5_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn('This HDF5 file has been opened with the "mpio" '
                 'communicator. mpi4py does not allow creation of '
                 'compressed datasets. Compression kwarg has been removed')

    if main_data_name in h5_group:
        raise ValueError('h5 dataset of that name already exists, choose a '
                         'different name or delete first')

    _ = kwargs.pop('dtype', None)

    # step 1 - create the empty dataset:
    h5_main = h5_group.create_dataset(main_data_name,
                                      shape=dataset.shape,
                                      dtype=dataset.dtype,
                                      **kwargs)
    if verbose:
        print('Created empty dataset: {} for writing Dask dataset: {}'
              ''.format(h5_main, dataset))
        print('Dask array will be written to HDF5 dataset: "{}" in file: '
              '"{}"'.format(h5_main.name, h5_main.file.filename))

    # Step 2 - now ask Dask to dump data to disk
    da.to_hdf5(h5_main.file.filename, {h5_main.name: dataset})

    if verbose:
        print('Created dataset for Main')

    #################
    # Add Dimensions
    #################
    dimensional_dict = {}
    for i, this_dim in dataset._axes.items():
        if not isinstance(this_dim, Dimension):
            raise ValueError('Dimension {} is not a sidpy Dimension'
                             ''.format(i))
        this_dim_dset = h5_group.create_dataset(this_dim.name,
                                                data=this_dim.values)
        attrs_to_write = {'name': this_dim.name,
                          'units': this_dim.units,
                          'quantity': this_dim.quantity,
                          'dimension_type': this_dim.dimension_type.name}
        write_simple_attrs(this_dim_dset, attrs_to_write)
        dimensional_dict[i] = this_dim_dset

    attrs_to_write = {'quantity': dataset.quantity,
                      'units': dataset.units,
                      'main_data_name': dataset.title,
                      'data_type': dataset.data_type.name,
                      'modality': dataset.modality,
                      'source': dataset.source}
    write_simple_attrs(h5_main, attrs_to_write)
    write_pynsid_book_keeping_attrs(h5_main)

    for attr_name in dir(dataset):
        attr_val = getattr(dataset, attr_name)
        if isinstance(attr_val, dict):
            if verbose:
                print('Writing attributes from property: {} of the '
                      'sidpy.Dataset'.format(attr_name))
            write_dict_to_h5_group(h5_group, attr_val, attr_name)

    # This will attach the dimensions
    nsid_data_main = link_as_main(h5_main, dimensional_dict)

    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    dataset.h5_dataset = nsid_data_main
    return nsid_data_main
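# Usage sketch for write_nsid_dataset, assuming sidpy and h5py are installed.
# The function name `_example_write_nsid_dataset`, the file name 'example.h5'
# and the dataset contents are hypothetical. sidpy.Dataset.from_array
# populates default Dimension objects for every axis, which
# write_nsid_dataset then stores and links as NSID dimension datasets.
def _example_write_nsid_dataset():
    import numpy as np
    import h5py
    import sidpy
    data = sidpy.Dataset.from_array(np.random.random((4, 5)))
    data.title = 'random_image'  # used to name the group holding the dataset
    with h5py.File('example.h5', mode='a') as h5_file:
        h5_group = h5_file.create_group('Measurement_000')
        h5_main = write_nsid_dataset(data, h5_group, verbose=True)
        print(h5_main.name)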
def write_results(h5_group, dataset=None, attributes=None, process_name=None):
    """
    Writes results of a processing step back to HDF5 in NSID format

    Parameters
    ----------
    h5_group : h5py.Group
        HDF5 Group into which results will be written
    dataset : sidpy.Dataset or list of sidpy.Dataset, optional. Default = None
        Dataset(s) to be written as results of the processing step
    attributes : dict, optional. Default = None
        Metadata regarding processing step
    process_name : str, optional. Default = None
        Suffix appended to the 'Log_' prefix of the group containing the
        process results

    Returns
    -------
    log_group : h5py.Group
        HDF5 group containing results
    """
    found_valid_dataset = False
    if dataset is not None:
        if isinstance(dataset, Dataset):
            dataset = [dataset]
        if isinstance(dataset, list):
            if not all([isinstance(itm, Dataset) for itm in dataset]):
                raise TypeError('List contains non-sidpy dataset entries! '
                                'Should only contain sidpy datasets')
            found_valid_dataset = True

    found_valid_attributes = False
    if attributes is not None:
        if isinstance(attributes, dict):
            if len(attributes) > 0:
                found_valid_attributes = True
        else:
            raise TypeError("Provided attributes is type {} but should be "
                            "type dict".format(type(attributes)))
    if not (found_valid_dataset or found_valid_attributes):
        raise ValueError('results should contain at least a sidpy Dataset '
                         'or a dictionary in results')

    log_name = 'Log_'
    if process_name is not None:
        log_name = log_name + process_name

    log_group = create_indexed_group(h5_group, log_name)
    write_book_keeping_attrs(log_group)
    write_pynsid_book_keeping_attrs(log_group)

    if found_valid_dataset:
        for dset in dataset:
            write_nsid_dataset(dset, log_group)
    if found_valid_attributes:
        write_simple_attrs(log_group, flatten_dict(attributes))

    return log_group
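# Usage sketch for write_results: assuming `h5_group` is a writable
# h5py.Group and `result` is a sidpy.Dataset produced by some processing
# step (the function name, arguments, and metadata values below are all
# hypothetical), log the dataset and its metadata together.
def _example_write_results(h5_group, result):
    metadata = {'algorithm': 'gaussian_blur', 'sigma': 2.0}  # hypothetical
    log_group = write_results(h5_group, dataset=result,
                              attributes=metadata,
                              process_name='Blur')
    print(log_group.name)  # auto-indexed, e.g. .../Log_Blur_000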
def create_results_group(h5_main, tool_name, h5_parent_group=None):
    """
    Creates a h5py.Group object auto-indexed and named as
    'DatasetName-ToolName_00x'

    Parameters
    ----------
    h5_main : h5py.Dataset object
        Reference to the dataset based on which the process / analysis is
        being performed
    tool_name : string / unicode
        Name of the Process / Analysis applied to h5_main
    h5_parent_group : h5py.Group, optional. Default = None
        Parent group under which the results group will be created. Use this
        option to write results into a new HDF5 file. By default, results
        will be written into the same group containing `h5_main`

    Returns
    -------
    h5_group : :class:`h5py.Group`
        Results group which can now house the results datasets
    """
    # TODO: Revise significantly. Avoid parent dataset name
    # Consider embedding refs to source datasets as attributes of group
    warn('The behavior of create_results_group is very likely to change '
         'soon and significantly. Use this function with caution',
         FutureWarning)

    if not isinstance(h5_main, h5py.Dataset):
        raise TypeError('h5_main should be a h5py.Dataset object')
    if h5_parent_group is not None:
        if not isinstance(h5_parent_group, (h5py.File, h5py.Group)):
            raise TypeError("'h5_parent_group' should either be a h5py.File "
                            "or h5py.Group object")
    else:
        h5_parent_group = h5_main.parent

    tool_name = validate_single_string_arg(tool_name, 'tool_name')

    if '-' in tool_name:
        warn('tool_name should not contain the "-" character. Reformatted '
             'name from:{} to {}'.format(tool_name,
                                         tool_name.replace('-', '_')))
    tool_name = tool_name.replace('-', '_')
    group_name = h5_main.name.split('/')[-1] + '-' + tool_name + '_'
    group_name = assign_group_index(h5_parent_group, group_name)

    h5_group = h5_parent_group.create_group(group_name)

    write_book_keeping_attrs(h5_group)

    # Also add some basic attributes like source and tool name. This will
    # allow relaxation of nomenclature restrictions. These are NOT being
    # used right now but will be in subsequent versions of pyNSID
    write_simple_attrs(h5_group, {'tool': tool_name, 'num_source_dsets': 1})
    # in this case, there is only one source
    if h5_parent_group.file == h5_main.file:
        for dset_ind, dset in enumerate([h5_main]):
            h5_group.attrs['source_' + '{:03d}'.format(dset_ind)] = dset.ref
    return h5_group
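# Usage sketch for create_results_group: assuming `h5_main` is an existing
# h5py.Dataset named e.g. 'Raw_Data' (the function name, dataset name, and
# tool name here are hypothetical), this creates a sibling group such as
# 'Raw_Data-Normalization_000' carrying a reference back to the source.
def _example_create_results_group(h5_main):
    h5_results = create_results_group(h5_main, 'Normalization')
    print(h5_results.name)            # e.g. .../Raw_Data-Normalization_000
    print(h5_results.attrs['tool'])   # 'Normalization'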