def link_h5_obj_as_alias(h5_main, h5_ancillary, alias_name):
    """
    Creates Dataset attributes that contain references to other Dataset
    objects. This function is useful when the reference attribute must have
    a reserved name, such as linking 'SHO_Indices' as 'Spectroscopic_Indices'

    Parameters
    ----------
    h5_main : h5py.Dataset
        Reference to the object to which attributes will be added
    h5_ancillary : h5py.Dataset
        Object whose reference will be accessible from h5_main.attrs
    alias_name : str
        Alias / alternate name for h5_ancillary
    """
    if not isinstance(h5_main, (h5py.Dataset, h5py.File, h5py.Group)):
        raise TypeError(
            'h5_main should either be a h5py Dataset, File, or Group')
    if not isinstance(h5_ancillary, (h5py.Dataset, h5py.Group)):
        raise TypeError(
            'h5_ancillary should be a h5py.Dataset or Group object')
    alias_name = validate_single_string_arg(alias_name, 'alias_name')
    __link_h5_obj(h5_main, h5_ancillary, alias=alias_name)
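# --- Illustrative usage sketch for link_h5_obj_as_alias() above. The file
# and dataset names are hypothetical; assumes h5py and numpy are installed
# and that the helper (and its private __link_h5_obj companion) is
# importable in the current namespace.
import h5py
import numpy as np

with h5py.File('alias_example.h5', 'w') as h5_f:
    h5_main = h5_f.create_dataset('Raw_Data', data=np.random.rand(4, 8))
    h5_sho_inds = h5_f.create_dataset('SHO_Indices',
                                      data=np.arange(8, dtype=np.uint32))
    # 'SHO_Indices' becomes reachable via the reserved attribute name
    # h5_main.attrs['Spectroscopic_Indices']
    link_h5_obj_as_alias(h5_main, h5_sho_inds, 'Spectroscopic_Indices')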
def __init__(self, file_path, *args, **kwargs): """ Parameters ----------- file_path : str Path to the file that needs to be read Attributes ---------- self._input_file_path : str Path to the file that will be read Notes ----- * This method will check to make sure that the provided file_path is indeed a string and a valid file path. * Consider calling ``can_read()`` within ``__init__()`` for validating the provided file Raises ------ FileNotFoundError """ file_path = validate_single_string_arg(file_path, 'file_path') if not os.path.exists(file_path): raise FileNotFoundError(file_path + ' does not exist') self._input_file_path = file_path
def find_dataset(h5_group, dset_name):
    """
    Uses visititems() to find all datasets with the desired name

    Parameters
    ----------
    h5_group : :class:`h5py.Group`
        Group to search within for the Dataset
    dset_name : str
        Name of the dataset to search for

    Returns
    -------
    datasets : list
        List of :class:`h5py.Dataset` objects whose names contain `dset_name`
    """
    if not isinstance(h5_group, (h5py.File, h5py.Group)):
        raise TypeError('h5_group should be a h5py.File or h5py.Group object')
    dset_name = validate_single_string_arg(dset_name, 'dset_name')

    # print('Finding all instances of', dset_name)
    datasets = []

    def __find_name(name, obj):
        if dset_name in name.split('/')[-1] and isinstance(obj, h5py.Dataset):
            datasets.append(obj)
        return

    h5_group.visititems(__find_name)

    return datasets
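# --- Illustrative usage sketch for find_dataset() above on a small,
# hypothetical file. Names are made up; assumes h5py and numpy are installed
# and the helper is importable in the current namespace.
import h5py
import numpy as np

with h5py.File('find_example.h5', 'w') as h5_f:
    h5_grp = h5_f.create_group('Measurement_000')
    h5_grp.create_dataset('Raw_Data', data=np.zeros((3, 5)))
    h5_grp.create_dataset('Raw_Data_Copy', data=np.ones((3, 5)))

    # Both datasets match since the comparison is a substring match on the
    # last component of the HDF5 path
    hits = find_dataset(h5_f, 'Raw_Data')
    print([dset.name for dset in hits])
    # expected: ['/Measurement_000/Raw_Data', '/Measurement_000/Raw_Data_Copy']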
def flatten_dict(nested_dict, separator='-'): """ Flattens a nested dictionary Parameters ---------- nested_dict : dict Nested dictionary separator : str, Optional. Default='-' Separator between the keys of different levels Returns ------- dict Dictionary whose keys are flattened to a single level Notes ----- Taken from https://stackoverflow.com/questions/6027558/flatten-nested- dictionaries-compressing-keys """ if not isinstance(nested_dict, dict): raise TypeError('nested_dict should be a dict') separator = validate_single_string_arg(separator, 'separator') def __flatten_dict_int(nest_dict, sep, parent_key=''): items = [] if sep == '_': repl = '-' else: repl = '_' for key, value in nest_dict.items(): if not isinstance(key, str): key = str(key) if sep in key: key = key.replace(sep, repl) new_key = parent_key + sep + key if parent_key else key if isinstance(value, MutableMapping): items.extend( __flatten_dict_int(value, sep, parent_key=new_key).items()) # nion files contain lists of dictionaries, oops elif isinstance(value, list): for i in range(len(value)): if isinstance(value[i], dict): for kk in value[i]: items.append( ('dim-' + kk + '-' + str(i), value[i][kk])) else: if type(value) != bytes: items.append((new_key, value)) else: if type(value) != bytes: items.append((new_key, value)) return dict(items) return __flatten_dict_int(nested_dict, separator)
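# --- Illustrative usage sketch for flatten_dict() above. Pure-Python
# demonstration, no HDF5 file needed; the metadata content is made up.
nested = {'instrument': {'detector': {'gain': 2.5, 'bias': '0.1 V'},
                         'vendor': 'ACME'},
          'scan_size': 512}
flat = flatten_dict(nested)
print(flat)
# expected: {'instrument-detector-gain': 2.5,
#            'instrument-detector-bias': '0.1 V',
#            'instrument-vendor': 'ACME',
#            'scan_size': 512}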
def create_results_group(h5_main, tool_name, h5_parent_group=None):
    """
    Creates an auto-indexed h5py.Group object named as
    'DatasetName-ToolName_00x'

    Parameters
    ----------
    h5_main : h5py.Dataset object
        Reference to the dataset based on which the process / analysis is
        being performed
    tool_name : string / unicode
        Name of the Process / Analysis applied to h5_main
    h5_parent_group : h5py.Group, optional. Default = None
        Parent group under which the results group will be created. Use this
        option to write results into a new HDF5 file. By default, results
        will be written into the same group containing `h5_main`

    Returns
    -------
    h5_group : :class:`h5py.Group`
        Results group which can now house the results datasets
    """
    if not isinstance(h5_main, h5py.Dataset):
        raise TypeError('h5_main should be a h5py.Dataset object')
    if h5_parent_group is not None:
        if not isinstance(h5_parent_group, (h5py.File, h5py.Group)):
            raise TypeError("'h5_parent_group' should either be a h5py.File "
                            "or h5py.Group object")
    else:
        h5_parent_group = h5_main.parent

    tool_name = validate_single_string_arg(tool_name, 'tool_name')

    if '-' in tool_name:
        warn('tool_name should not contain the "-" character. Reformatted '
             'name from: {} to {}'.format(tool_name,
                                          tool_name.replace('-', '_')))
    tool_name = tool_name.replace('-', '_')

    group_name = h5_main.name.split('/')[-1] + '-' + tool_name + '_'
    group_name = assign_group_index(h5_parent_group, group_name)

    h5_group = h5_parent_group.create_group(group_name)

    write_book_keeping_attrs(h5_group)

    # Also add some basic attributes like source and tool name. This will
    # allow relaxation of nomenclature restrictions:
    # these are NOT being used right now but will be in subsequent versions
    # of pyUSID
    write_simple_attrs(h5_group, {'tool': tool_name, 'num_source_dsets': 1})
    # in this case, there is only one source
    if h5_parent_group.file == h5_main.file:
        for dset_ind, dset in enumerate([h5_main]):
            h5_group.attrs['source_' + '{:03d}'.format(dset_ind)] = dset.ref

    return h5_group
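# --- Illustrative usage sketch showing the auto-indexed naming produced by
# create_results_group() above. File / dataset / tool names are hypothetical;
# assumes h5py, numpy and the companion helpers used inside the function
# (assign_group_index, write_book_keeping_attrs, write_simple_attrs) are
# importable in the current namespace.
import h5py
import numpy as np

with h5py.File('results_example.h5', 'w') as h5_f:
    h5_main = h5_f.create_dataset('Raw_Data', data=np.random.rand(16, 128))
    h5_res_0 = create_results_group(h5_main, 'SHO_Fit')
    h5_res_1 = create_results_group(h5_main, 'SHO_Fit')
    print(h5_res_0.name, h5_res_1.name)
    # expected: /Raw_Data-SHO_Fit_000 /Raw_Data-SHO_Fit_001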
def write_dict_to_h5_group(h5_group, metadata, group_name):
    """
    If the provided metadata parameter is a non-empty dictionary, this
    function will create an HDF5 group called group_name within the provided
    h5_group and write the contents of metadata into the newly created group

    Parameters
    ----------
    h5_group : h5py.Group
        Parent group to write metadata into
    metadata : dict
        Dictionary that needs to be written into the group
    group_name : str
        Name of the group to write attributes into

    Returns
    -------
    h5_metadata_grp : h5py.Group
        Handle to the newly created group containing the metadata

    Notes
    -----
    Nested dictionaries will be flattened until sidpy implements functions
    to write and read nested dictionaries to and from HDF5 files
    """
    if not isinstance(metadata, dict):
        raise TypeError('metadata is not a dict but of type: {}'
                        ''.format(type(metadata)))
    if len(metadata) < 1:
        return None
    if not isinstance(h5_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_group is neither a h5py.Group nor a h5py.File '
                        'object and is of type: {}'.format(type(h5_group)))
    group_name = validate_single_string_arg(group_name, 'group_name')
    group_name = group_name.replace(' ', '_')
    h5_md_group = h5_group.create_group(group_name)
    flat_dict = flatten_dict(metadata)
    write_simple_attrs(h5_md_group, flat_dict)
    return h5_md_group
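# --- Illustrative usage sketch for write_dict_to_h5_group() above: writes a
# nested metadata dictionary into its own group. Names are hypothetical;
# assumes h5py plus the flatten_dict() and write_simple_attrs() helpers are
# importable in the current namespace.
import h5py

metadata = {'scan': {'rate_hz': 1.0, 'points': 256}, 'operator': 'editor'}

with h5py.File('metadata_example.h5', 'w') as h5_f:
    h5_meas = h5_f.create_group('Measurement_000')
    h5_md = write_dict_to_h5_group(h5_meas, metadata, 'Original Metadata')
    # Spaces in the group name are replaced with underscores and the nested
    # dict is flattened into simple attributes
    print(h5_md.name)         # expected: /Measurement_000/Original_Metadata
    print(dict(h5_md.attrs))  # keys like 'scan-rate_hz', 'scan-points', 'operator'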
def is_valid_file(file_path, *args, **kwargs):
    """
    Checks whether the provided file can be read by this translator. This
    basic function compares the file extension against the "extension"
    keyword argument. If the extension matches, this function returns the
    path to the file; otherwise it returns None

    Parameters
    ----------
    file_path : str
        Path to raw data file

    Returns
    -------
    file_path : str
        Path to the file that needs to be provided to translate() if the
        provided file was indeed a valid file. Else, None
    """
    file_path = validate_single_string_arg(file_path, 'file_path')
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path + ' does not exist')

    targ_ext = kwargs.get('extension', None)
    if not targ_ext:
        raise NotImplementedError('Either is_valid_file() has not been '
                                  'implemented by this translator or the '
                                  '"extension" keyword argument was '
                                  'missing')
    if isinstance(targ_ext, (str, unicode)):
        targ_ext = [targ_ext]
    targ_ext = validate_list_of_strings(targ_ext,
                                        parm_name='(keyword argument) '
                                                  '"extension"')

    # Get rid of any '.' separators that may be in the list of extensions
    # Also turn to lower case for case insensitive comparisons
    targ_ext = [item.replace('.', '').lower() for item in targ_ext]

    file_path = os.path.abspath(file_path)
    extension = os.path.splitext(file_path)[1][1:]
    # Ensure extension is lower case just like targets above
    extension = extension.lower()

    if extension in targ_ext:
        return file_path
    else:
        return None
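# --- Illustrative usage sketch for the extension filtering performed by
# is_valid_file() above, using a throwaway file. In pyUSID this function
# appears to belong to a Translator class; here it is exercised directly as
# defined above. The file name and extensions are hypothetical; assumes the
# module-level helpers it references (validate_list_of_strings, the unicode
# alias) are available.
import os

dummy_path = 'example_measurement.ibw'
with open(dummy_path, 'w') as file_handle:
    file_handle.write('not real data')

print(is_valid_file(dummy_path, extension='ibw'))   # absolute path returned
print(is_valid_file(dummy_path, extension='.IBW'))  # '.' and case are ignored
print(is_valid_file(dummy_path, extension='txt'))   # None - extension mismatch

os.remove(dummy_path)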
def dimension_type(self, value): if isinstance(value, DimensionTypes): self._dimension_type = value else: dimension_type = validate_single_string_arg( value, 'dimension_type') if dimension_type.upper() in DimensionTypes._member_names_: self._dimension_type = DimensionTypes[dimension_type.upper()] elif dimension_type.lower() in ['frame', 'time', 'stack']: self._dimension_type = DimensionTypes.TEMPORAL else: self._dimension_type = DimensionTypes.UNKNOWN print('Supported dimension_types for plotting are only: ', DimensionTypes._member_names_) print('Setting DimensionTypes to UNKNOWN')
def assign_group_index(h5_parent_group, base_name, verbose=False): """ Searches the parent h5 group to find the next available index for the group Parameters ---------- h5_parent_group : :class:`h5py.Group` object Parent group under which the new group object will be created base_name : str or unicode Base name of the new group without index verbose : bool, optional. Default=False Whether or not to print debugging statements Returns ------- base_name : str or unicode Base name of the new group with the next available index as a suffix """ if not isinstance(h5_parent_group, h5py.Group): raise TypeError('h5_parent_group should be a h5py.Group object') base_name = validate_single_string_arg(base_name, 'base_name') if len(base_name) == 0: raise ValueError('base_name should not be an empty string') if not base_name.endswith('_'): base_name += '_' temp = [key for key in h5_parent_group.keys()] if verbose: print( 'Looking for group names starting with {} in parent containing items: ' '{}'.format(base_name, temp)) previous_indices = [] for item_name in temp: if isinstance(h5_parent_group[item_name], h5py.Group) and item_name.startswith(base_name): previous_indices.append(int(item_name.replace(base_name, ''))) previous_indices = np.sort(previous_indices) if verbose: print('indices of existing groups with the same prefix: {}'.format( previous_indices)) if len(previous_indices) == 0: index = 0 else: index = previous_indices[-1] + 1 return base_name + '{:03d}'.format(index)
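# --- Illustrative usage sketch for assign_group_index() above: picks the
# next free numeric suffix among existing sibling groups. Names are
# hypothetical; assumes h5py is installed and the helper is importable.
import h5py

with h5py.File('index_example.h5', 'w') as h5_f:
    h5_f.create_group('Measurement_000')
    h5_f.create_group('Measurement_001')
    # The trailing '_' is appended automatically if missing
    print(assign_group_index(h5_f, 'Measurement'))
    # expected: 'Measurement_002'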
def __init__(self, name, units, values, mode=DimType.DEFAULT): """ Simple object that describes a dimension in a dataset by its name, units, and values Parameters ---------- name : str or unicode Name of the dimension. For example 'Bias' units : str or unicode Units for this dimension. For example: 'V' values : array-like or int Values over which this dimension was varied. A linearly increasing set of values will be generated if an integer is provided instead of an array. mode : Enum, Optional. Default = DimType.DEFAULT How the parameter associated with the dimension was varied. DimType.DEFAULT - data was recorded for all combinations of values in this dimension against **all** other dimensions. This is typically the case. DimType.INCOMPLETE - Data not present for all combinations of values in this dimension and all other dimensions. Examples include spiral scans, sparse sampling, aborted measurements DimType.DEPENDENT - Values in this dimension were varied as a function of another (independent) dimension. """ name = validate_single_string_arg(name, 'name') if not isinstance(units, (str, unicode)): raise TypeError('units should be a string') units = units.strip() if isinstance(values, int): if values < 1: raise ValueError( 'values should at least be specified as a positive integer' ) values = np.arange(values) if not isinstance(values, (np.ndarray, list, tuple)): raise TypeError('values should be array-like') if not isinstance(mode, DimType): raise TypeError( 'mode must be of type pyUSID.DimType. Provided object was of type: {}' .format(type(mode))) self.name = name self.units = units self.values = values self.mode = mode
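# --- Illustrative usage sketch for the constructor above, assuming it
# belongs to the Dimension class referenced elsewhere in this module. The
# dimension names and values are made up; assumes numpy and DimType are
# importable in the current namespace.
import numpy as np

bias_dim = Dimension('Bias', 'V', np.linspace(-2, 2, 64))
cycle_dim = Dimension('Cycle', '', 3)  # an int expands to np.arange(3)
print(bias_dim.name, bias_dim.units, len(bias_dim.values))
print(cycle_dim.values)  # expected: [0 1 2]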
def dimension_type(self, value): if isinstance(value, DimensionType): self._dimension_type = value else: dimension_type = validate_single_string_arg( value, 'dimension_type') if dimension_type.upper() in [ member.name for member in DimensionType ]: self._dimension_type = DimensionType[dimension_type.upper()] elif dimension_type.lower() in ['frame', 'time', 'stack']: self._dimension_type = DimensionType.TEMPORAL else: self._dimension_type = DimensionType.UNKNOWN warn('Supported dimension types for plotting are only: {}' ''.format([member.name for member in DimensionType])) warn('Setting DimensionType to UNKNOWN')
def find_results_groups(h5_main, tool_name, h5_parent_group=None):
    """
    Finds a list of all groups containing results of the process of name
    ``tool_name`` being applied to the dataset

    Parameters
    ----------
    h5_main : h5 dataset reference
        Reference to the target dataset to which the tool was applied
    tool_name : String / unicode
        Name of the tool applied to the target dataset
    h5_parent_group : h5py.Group, optional. Default = None
        Parent group under which the results group will be searched for. Use
        this option when the results groups are contained in a different HDF5
        file compared to `h5_main`. By default, this function will search
        within the same group that contains `h5_main`

    Returns
    -------
    groups : list of references to :class:`h5py.Group` objects
        Groups whose name contains the tool name and the dataset name
    """
    warn('The behavior of find_results_groups is very likely to change soon '
         'and significantly. Use this function with caution', FutureWarning)

    if not isinstance(h5_main, h5py.Dataset):
        raise TypeError('h5_main should be a h5py.Dataset object')
    tool_name = validate_single_string_arg(tool_name, 'tool_name')

    if h5_parent_group is not None:
        if not isinstance(h5_parent_group, (h5py.File, h5py.Group)):
            raise TypeError("'h5_parent_group' should either be a h5py.File "
                            "or h5py.Group object")
    else:
        h5_parent_group = h5_main.parent

    dset_name = h5_main.name.split('/')[-1]
    groups = []
    for key in h5_parent_group.keys():
        if dset_name in key and tool_name in key and \
                isinstance(h5_parent_group[key], h5py.Group):
            groups.append(h5_parent_group[key])
    return groups
def get_attr(h5_object, attr_name): """ Returns the attribute from the h5py object Parameters ---------- h5_object : :class:`h5py.Dataset`, :class:`h5py.Group` or :class:`h5py.File` object whose attribute is desired attr_name : str Name of the attribute of interest Returns ------- att_val : object value of attribute, in certain cases (byte strings or list of byte strings) reformatted to readily usable forms """ if not isinstance(h5_object, (h5py.Dataset, h5py.Group, h5py.File)): raise TypeError( 'h5_object should be a h5py.Dataset, h5py.Group or h5py.File object' ) attr_name = validate_single_string_arg(attr_name, 'attr_name') if attr_name not in h5_object.attrs.keys(): raise KeyError("'{}' is not an attribute in '{}'".format( attr_name, h5_object.name)) h5py_major = int(h5py.__version__.split('.')[0]) att_val = h5_object.attrs.get(attr_name) if isinstance(att_val, np.bytes_) or isinstance(att_val, bytes): att_val = att_val.decode('utf-8') elif isinstance(att_val, np.ndarray): if sys.version_info.major == 3: if att_val.dtype.type in [np.bytes_]: att_val = np.array([str(x, 'utf-8') for x in att_val]) elif att_val.dtype.type in [np.object_] and h5py_major < 3: att_val = np.array([str(x, 'utf-8') for x in att_val]) return att_val
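# --- Illustrative usage sketch for get_attr() above, showing how byte-string
# attributes are decoded into regular Python strings. Names are hypothetical;
# assumes h5py and numpy are installed and the helper is importable.
import h5py
import numpy as np

with h5py.File('attrs_example.h5', 'w') as h5_f:
    h5_dset = h5_f.create_dataset('Raw_Data', data=np.zeros(5))
    h5_dset.attrs['quantity'] = 'Current'
    h5_dset.attrs['units'] = np.array([b'nA'])

    print(get_attr(h5_dset, 'quantity'))  # 'Current' as a regular str
    print(get_attr(h5_dset, 'units'))     # byte strings decoded to ['nA']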
def __check_anc_before_creation(aux_prefix, dim_type='pos'): aux_prefix = validate_single_string_arg(aux_prefix, 'aux_' + dim_type + '_prefix') if not aux_prefix.endswith('_'): aux_prefix += '_' if '-' in aux_prefix: warn( 'aux_' + dim_type + ' should not contain the "-" character. Reformatted name from:{} to ' '{}'.format(aux_prefix, aux_prefix.replace('-', '_'))) aux_prefix = aux_prefix.replace('-', '_') for dset_name in [aux_prefix + 'Indices', aux_prefix + 'Values']: if dset_name in h5_parent_group.keys(): # TODO: What if the contained data was correct? raise KeyError( 'Dataset named: ' + dset_name + ' already exists in group: ' '{}. Consider passing these datasets using kwargs (if they are correct) instead of providing the pos_dims and spec_dims arguments' .format(h5_parent_group.name)) return aux_prefix
def create_indexed_group(h5_parent_group, base_name):
    """
    Creates a group with an indexed name (e.g. 'Measurement_012') under
    ``h5_parent_group`` using the provided ``base_name`` as a prefix for the
    group's name

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or :class:`h5py.File`
        File or group within which the new group will be created
    base_name : str or unicode
        Prefix for the group name. This need not end with a '_'. It will be
        added automatically

    Returns
    -------
    h5_new_group : :class:`h5py.Group`
        Newly created group with the next available index appended to the
        provided base name
    """
    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or Group object')
    base_name = validate_single_string_arg(base_name, 'base_name')

    group_name = assign_group_index(h5_parent_group, base_name)
    h5_new_group = h5_parent_group.create_group(group_name)
    write_book_keeping_attrs(h5_new_group)
    return h5_new_group
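# --- Illustrative usage sketch for create_indexed_group() above: builds two
# sibling groups with auto-incremented names. Names are hypothetical; assumes
# h5py plus the assign_group_index() and write_book_keeping_attrs() helpers
# are importable in the current namespace.
import h5py

with h5py.File('indexed_group_example.h5', 'w') as h5_f:
    h5_meas_0 = create_indexed_group(h5_f, 'Measurement')
    h5_meas_1 = create_indexed_group(h5_f, 'Measurement')
    print(h5_meas_0.name, h5_meas_1.name)
    # expected: /Measurement_000 /Measurement_001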
def __init__(self, h5_main, process_name, parms_dict=None, cores=None, max_mem_mb=4 * 1024, mem_multiplier=1.0, lazy=False, h5_target_group=None, verbose=False): """ Parameters ---------- h5_main : :class:`~pyUSID.io.usi_data.USIDataset` The USID main HDF5 dataset over which the analysis will be performed. process_name : str Name of the process cores : uint, optional How many cores to use for the computation. Default: all available cores - 2 if operating outside MPI context max_mem_mb : uint, optional How much memory to use for the computation. Default 1024 Mb mem_multiplier : float, optional. Default = 1 mem_multiplier is the number that will be multiplied with the (byte) size of a single position in the source dataset in order to better estimate the number of positions that can be processed at any given time (how many pixels of the source and results datasets can be retained in memory). The default value of 1.0 only accounts for the source dataset. A value greater than 1 would account for the size of results datasets as well. For example, if the result dataset is the same size and precision as the source dataset, the multiplier will be 2 (1 for source, 1 for result) lazy : bool, optional. Default = False If True, read_data_chunk and write_results_chunk will operate on dask arrays. If False - everything will be in numpy. h5_target_group : h5py.Group, optional. Default = None Location where to look for existing results and to place newly computed results. Use this kwarg if the results need to be written to a different HDF5 file. By default, this value is set to the parent group containing `h5_main` verbose : bool, Optional, default = False Whether or not to print debugging statements Attributes ---------- self.h5_results_grp : :class:`h5py.Group` HDF5 group containing the HDF5 datasets that contain the results of the computation self.verbose : bool Whether or not to print debugging statements self.parms_dict : dict Dictionary of parameters for the computation self.duplicate_h5_groups : list List of :class:`h5py.Group` objects containing computational results that have been completely computed with the same set of parameters as those in self.parms_dict self.partial_h5_groups : list List of :class:`h5py.Group` objects containing computational results that have been partially computed with the same set of parameters as those in self.parms_dict self.process_name : str Name of the process. This is used for checking for existing completely and partially computed results as well as for naming the HDF5 group that will contain the results of the computation self._cores : uint Number of CPU cores to use for parallel computations. Ignored in the MPI context. Each rank gets 1 CPU core self._max_pos_per_read : uint Number of positions in the dataset to read per chunk self._status_dset_name : str Name of the HDF5 dataset that keeps track of the positions in the source dataset thave already been computed self._results : list List of objects returned as the result of computation performed by the self._map_function for each position in the current batch of positions that were processed self._h5_target_group : h5py.Group Location where existing / future results will be stored self.__resume_implemented : bool Whether or not this (child) class has implemented the self._get_existing_datasets() function self.__bytes_per_pos : uint Number of bytes used by one position of the source dataset self.mpi_comm : :class:`mpi4py.MPI.COMM_WORLD` MPI communicator. 
None if not running in an MPI context self.mpi_rank: uint MPI rank. Always 0 if not running in an MPI context self.mpi_size: uint Number of ranks in COMM_WORLD. 1 if not running in an MPI context self.__ranks_on_socket : uint Number of MPI ranks on a given CPU socket self.__socket_master_rank : uint Master MPI rank for a given CPU chip / socket self.__compute_jobs : array-like List of positions in the HDF5 dataset that need to be computed. This may not be a continuous list of numbers if multiple MPI workers had previously started computing and were interrupted. self.__start_pos : uint The index within self.__compute_jobs that a particular MPI rank / worker needs to start computing from. self.__rank_end_pos : uint The index within self.__compute_jobs that a particular MPI rank / worker needs to start computing till. self.__end_pos : uint The index within self.__compute_jobs that a particular MPI rank / worker needs to start computing till for the current batch of positions. self.__pixels_in_batch : array-like The positions being computed on by the current compute worker """ if h5_main.file.mode != 'r+': raise TypeError( 'Need to ensure that the file is in r+ mode to write results back to the file' ) MPI = get_MPI() # Ensure that the file is opened in the correct comm or something if MPI is not None and h5_main.file.driver != 'mpio': warn('Code was called in MPI context but HDF5 file was not opened ' 'with the "mpio" driver. JobLib will be used instead of MPI ' 'for parallel computation') MPI = None if MPI is not None: # If we came here then, the user has intentionally asked for multi-node computation comm = MPI.COMM_WORLD self.mpi_comm = comm self.mpi_rank = comm.Get_rank() self.mpi_size = comm.Get_size() if verbose: print( "Rank {} of {} on {} sees {} logical cores on the socket". format(comm.Get_rank(), comm.Get_size(), MPI.Get_processor_name(), cpu_count())) # First, ensure that cores=logical cores in node. No point being economical / considerate cores = psutil.cpu_count() # It is sufficient if just one rank checks all this. if self.mpi_rank == 0: print('Working on {} ranks via MPI'.format(self.mpi_size)) if verbose and self.mpi_rank == 0: print('Finished getting all necessary MPI information') """ # Not sure how to check for this correctly messg = None try: if h5_main.file.comm != comm: messg = 'The HDF5 file should have been opened with comm=MPI.COMM_WORLD. Currently comm={}' ''.format(h5_main.file.comm) except AttributeError: messg = 'The HDF5 file should have been opened with comm=MPI.COMM_WORLD' if messg is not None: raise TypeError(messg) """ else: if verbose: print( 'No mpi4py found or script was not called via mpixexec / mpirun. ' 'Assuming single node computation') self.mpi_comm = None self.mpi_size = 1 self.mpi_rank = 0 # Checking if dataset is "Main" if not check_if_main(h5_main, verbose=verbose and self.mpi_rank == 0): raise ValueError( 'Provided dataset is not a "Main" dataset with necessary ancillary datasets' ) if h5_target_group is not None: if not isinstance(h5_target_group, (h5py.Group, h5py.File)): raise TypeError( "'h5_target_group' must be a h5py.Group object") else: h5_target_group = h5_main.parent self._h5_target_group = h5_target_group process_name = validate_single_string_arg(process_name, 'process_name') if parms_dict is None: parms_dict = {} else: if not isinstance(parms_dict, dict): raise TypeError("Expected 'parms_dict' of type: dict") if MPI is not None: MPI.COMM_WORLD.barrier() # Not sure if we need a barrier here. 
if verbose and self.mpi_rank == 0: print( 'Rank {}: Upgrading from a regular h5py.Dataset to a USIDataset' .format(self.mpi_rank)) # Generation of N-dimensional form would break things for some reason. self.h5_main = USIDataset(h5_main) if verbose and self.mpi_rank == 0: print('Rank {}: The HDF5 dataset is now a USIDataset'.format( self.mpi_rank)) # Saving these as properties of the object: self.verbose = verbose self.__lazy = lazy self._cores = None self.__ranks_on_socket = 1 self.__socket_master_rank = 0 self._max_pos_per_read = None self.__bytes_per_pos = None # Now have to be careful here since the below properties are a function of the MPI rank self.__start_pos = None self.__rank_end_pos = None self.__end_pos = None self.__pixels_in_batch = None self.__compute_jobs = None # Determining the max size of the data that can be put into memory # all ranks go through this and they need to have this value any self._set_memory_and_cores(cores=cores, man_mem_limit=max_mem_mb, mem_multiplier=mem_multiplier) if verbose and self.mpi_rank == 0: print('Finished collecting info on memory and workers') self.duplicate_h5_groups = [] self.partial_h5_groups = [] self.process_name = process_name # Reset this in the extended classes self.parms_dict = parms_dict """ The name of the HDF5 dataset that should be present to signify which positions have already been computed This is NOT a fully private variable so that multiple processes can be run within a single group - Eg Fitter In the case of Fitter - this name can be changed from 'completed_guesses' to 'completed_fits' check_for_duplicates will be called by the Child class where they have the opportunity to change this variable before checking for duplicates """ self._status_dset_name = 'completed_positions' self._results = None self.h5_results_grp = None # Check to see if the resuming feature has been implemented: self.__resume_implemented = False try: self._get_existing_datasets() except NotImplementedError: if verbose and self.mpi_rank == 0: print( 'It appears that this class may not be able to resume computations' ) except: # NameError for variables that don't exist # AttributeError for self.var_name that don't exist # TypeError (NoneType) etc. self.__resume_implemented = True if self.mpi_rank == 0: print( 'Consider calling test() to check results before calling compute() which computes on the entire' ' dataset and writes results to the HDF5 file') self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates( )
def quantity(self, value): self._quantity = validate_single_string_arg(value, 'quantity')
def write_ind_val_dsets(h5_parent_group, dimensions, is_spectral=True, verbose=False, base_name=None, slow_to_fast=False): """ Creates h5py.Datasets for the position OR spectroscopic indices and values of the data. Remember that the contents of the dataset can be changed if need be after the creation of the datasets. For example if one of the spectroscopic dimensions (e.g. - Bias) was sinusoidal and not linear, The specific dimension in the Spectroscopic_Values dataset can be manually overwritten. Parameters ---------- h5_parent_group : :class:`h5py.Group` or :class:`h5py.File` Group under which the indices and values datasets will be created dimensions : Dimension or array-like of Dimension objects Sequence of Dimension objects that provides all necessary instructions for constructing the indices and values datasets is_spectral : bool, optional. default = True Spectroscopic (True) or Position (False) verbose : Boolean, optional Whether or not to print statements for debugging purposes base_name : str or unicode, optional Prefix for the datasets. Default: 'Position' when is_spectral is False, 'Spectroscopic' otherwise slow_to_fast : bool, Optional. Default=False Set to True if the dimensions are arranged from slowest varying to fastest varying. Set to False otherwise. Returns ------- h5_spec_inds : h5py.Dataset Dataset containing the position indices h5_spec_vals : h5py.Dataset Dataset containing the value at each position Notes ----- `steps`, `initial_values`, `labels`, and 'units' must be the same length as `dimensions` when they are specified. Dimensions should be in the order from fastest varying to slowest. """ if isinstance(dimensions, Dimension): dimensions = [dimensions] if not isinstance(dimensions, (list, np.ndarray, tuple)): raise TypeError('dimensions should be array-like ') if not np.all([isinstance(x, Dimension) for x in dimensions]): raise TypeError('dimensions should be a sequence of Dimension objects') if not isinstance(h5_parent_group, (h5py.Group, h5py.File)): raise TypeError('h5_parent_group should be a h5py.File or Group object') if not is_editable_h5(h5_parent_group): raise ValueError('The provided h5 object is not valid / open') if base_name is not None: base_name = validate_single_string_arg(base_name, 'base_name') if not base_name.endswith('_'): base_name += '_' else: base_name = 'Position_' if is_spectral: base_name = 'Spectroscopic_' if not slow_to_fast: warn('In the future write_ind_val_dsets will default to requiring dimensions to be arranged from slowest to fastest varying') # check if the datasets already exist. If they do, there's no point in going any further for sub_name in ['Indices', 'Values']: if base_name + sub_name in h5_parent_group.keys(): raise KeyError('Dataset: {} already exists in provided group: {}'.format(base_name + sub_name, h5_parent_group.name)) modes = [dim.mode for dim in dimensions] sing_mode = np.unique(modes) if sing_mode.size > 1: raise NotImplementedError('Cannot yet work on combinations of modes for Dimensions. 
Consider doing manually') sing_mode = sing_mode[0] if sing_mode == DimType.DEFAULT: if slow_to_fast: # Ensure that the dimensions are arranged from fast to slow instead dimensions = dimensions[::-1] indices, values = build_ind_val_matrices([dim.values for dim in dimensions], is_spectral=is_spectral) # At this point, dimensions and unit values are arranged from fastest to slowest # We want dimensions to be arranged from slowest to fastest: rev_func = np.flipud if is_spectral else np.fliplr dimensions = dimensions[::-1] indices = rev_func(indices) values = rev_func(values) elif sing_mode == DimType.INCOMPLETE: lengths = np.unique([len(dim.values) for dim in dimensions]) if len(lengths) > 1: raise ValueError('Values for dimensions not of same length') single_dim = np.arange(lengths[0], dtype=INDICES_DTYPE) indices = np.tile(single_dim, (2, 1)).T values = np.dstack(tuple([dim.values for dim in dimensions])).squeeze() if is_spectral: indices = indices.T values = values.T else: raise NotImplementedError('Cannot yet work on Dependent dimensions') if verbose: print('Indices:') print(indices) print('Values:') print(values) # Create the Datasets for both Indices and Values h5_indices = h5_parent_group.create_dataset(base_name + 'Indices', data=INDICES_DTYPE(indices), dtype=INDICES_DTYPE) h5_values = h5_parent_group.create_dataset(base_name + 'Values', data=VALUES_DTYPE(values), dtype=VALUES_DTYPE) for h5_dset in [h5_indices, h5_values]: write_simple_attrs(h5_dset, {'units': [x.units for x in dimensions], 'labels': [x.name for x in dimensions], 'type': [dim.mode.value for dim in dimensions]}) warn('pyUSID.io.hdf_utils.simple.write_ind_val_dsets no longer creates' 'region references for each dimension. Please use ' 'pyUSID.io.reg_ref.write_region_references to manually create region ' 'references') return h5_indices, h5_values
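# --- Illustrative usage sketch for write_ind_val_dsets() above: builds the
# ancillary Position indices / values datasets for a 2 x 3 grid from
# Dimension objects. Names and values are hypothetical; assumes h5py, numpy
# and the supporting helpers used inside the function (Dimension,
# build_ind_val_matrices, is_editable_h5, write_simple_attrs) are importable
# in the current namespace.
import h5py
import numpy as np

pos_dims = [Dimension('X', 'um', np.linspace(0.0, 1.0, 3)),
            Dimension('Y', 'um', np.linspace(0.0, 2.0, 2))]

with h5py.File('ind_val_example.h5', 'w') as h5_f:
    h5_pos_inds, h5_pos_vals = write_ind_val_dsets(h5_f, pos_dims,
                                                   is_spectral=False)
    print(h5_pos_inds.shape, h5_pos_vals.shape)
    # expected: (6, 2) (6, 2) - positions along rows, dimensions along columns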
def name(self, value): self._name = validate_single_string_arg(value, 'name')
def units(self, value): self._units = validate_single_string_arg(value, 'units')
def write_reduced_anc_dsets(h5_parent_group, h5_inds, h5_vals, dim_name, basename=None, is_spec=None, verbose=False): """ Creates new Ancillary Indices and Values datasets from the input datasets by dropping the specified dimensions Parameters ---------- h5_parent_group : :class:`h5py.Group` or h5py.File Group under which the indices and values datasets will be created h5_inds : HDF5 Dataset Spectroscopic or Positions indices dataset h5_vals : HDF5 Dataset Spectroscopic or Positions values dataset dim_name : str or unicode or list of strings Names of the dimension(s) to remove basename : str or unicode, Optional String to which '_Indices' and '_Values' will be appended to get the names of the new datasets. Default = 'Position' or 'Spectroscopic' is_spec : bool, optional Whether or not the provided ancillary datasets are position or spectroscopic The user is recommended to supply this parameter whenever it is known or possible. By default, this function will attempt to recognize the answer based on the shape of the datasets. verbose : bool, optional. Default = False Whether or not to print debugging print statements Returns ------- h5_inds_new : h5py.Dataset Reduced indices dataset h5_vals_new : h5py.Dataset Reduces values dataset """ if not isinstance(h5_parent_group, (h5py.Group, h5py.File)): raise TypeError('h5_parent_group should either be a h5py. Group or File object') for param, param_name in zip([h5_inds, h5_vals], ['h5_inds', 'h5_vals']): if not isinstance(param, h5py.Dataset): raise TypeError(param_name + ' should be a h5py.Dataset object') if dim_name is not None: dim_name = validate_list_of_strings(dim_name, 'dim_name') all_dim_names = list(get_attr(h5_inds, 'labels')) for item in dim_name: if item not in all_dim_names: raise KeyError('Requested dimension: {} not in the list of labels: {}'.format(item, all_dim_names)) ind_mat = h5_inds[()] val_mat = h5_vals[()] if is_spec is None: # Attempt to recognize the type automatically is_spec = False if ind_mat.shape[0] == ind_mat.shape[1]: raise ValueError('Unable automatically guess whether the provided datasets are position or ' 'spectroscopic. Please explicitely specify via the "is_spec" boolean kwarg') if ind_mat.shape[0] < ind_mat.shape[1]: is_spec = True else: if not isinstance(is_spec, bool): raise TypeError('is_spec should be a boolean. Provided object is of type: {}'.format(type(is_spec))) if basename is not None: basename = validate_single_string_arg(basename, 'basename') if basename.endswith('_'): basename = basename[:-1] else: if is_spec: basename = 'Spectroscopic' else: basename = 'Position' for sub_name in ['_Indices', '_Values']: if basename + sub_name in h5_parent_group.keys(): raise KeyError('Dataset: {} already exists in provided group: {}'.format(basename + sub_name, h5_parent_group.name)) if set(dim_name) != set(all_dim_names): # At least one dimension will remain if verbose: print('All Dimensions: {}. Dimensions to be removed: {}'.format(all_dim_names, dim_name)) if not is_spec: # Convert to spectral shape ind_mat = np.transpose(ind_mat) val_mat = np.transpose(val_mat) # For all dimensions, find where the index = 0 # basically, we are indexing all dimensions to 0 first_indices = [] keep_dim = np.ones(len(all_dim_names), dtype=bool) for cur_dim in dim_name: dim_ind = all_dim_names.index(cur_dim) keep_dim[dim_ind] = False # check equality against the minimum value instead of 0 to account for cases when a dimension does not start # from 0 (already been sliced) - think of multi-dimensional slicing! 
first_indices.append(ind_mat[dim_ind] == np.min(ind_mat[dim_ind])) first_indices = np.vstack(first_indices) if verbose: print('Raw first_indices:') print(first_indices) print('Dimensions to keep: {}'.format(keep_dim)) step_starts = np.all(first_indices, axis=0) if verbose: print('Columns in dataset to keep:') print(step_starts) ''' Extract all rows that we want to keep from input indices and values ''' # TODO: handle TypeError: Indexing elements must be in increasing order ind_mat = ind_mat[keep_dim, :][:, step_starts] val_mat = val_mat[keep_dim, :][:, step_starts] if not is_spec: # Convert back to position shape ind_mat = np.transpose(ind_mat) val_mat = np.transpose(val_mat) ''' Create new Datasets to hold the data Name them based on basename ''' h5_inds_new = h5_parent_group.create_dataset(basename + '_Indices', data=ind_mat, dtype=h5_inds.dtype) h5_vals_new = h5_parent_group.create_dataset(basename + '_Values', data=val_mat, dtype=h5_vals.dtype) # Extracting the labels from the original spectroscopic data sets labels = h5_inds.attrs['labels'][keep_dim] # Creating the dimension slices for the new spectroscopic data sets # Adding the labels and units to the new spectroscopic data sets for dset in [h5_inds_new, h5_vals_new]: write_simple_attrs(dset, {'labels': labels, 'units': h5_inds.attrs['units'][keep_dim]}) else: # Remove all dimensions: h5_inds_new = h5_parent_group.create_dataset(basename + '_Indices', data=np.array([[0]]), dtype=INDICES_DTYPE) h5_vals_new = h5_parent_group.create_dataset(basename + '_Values', data=np.array([[0]]), dtype=VALUES_DTYPE) for dset in [h5_inds_new, h5_vals_new]: write_simple_attrs(dset, {'labels': ['Single_Step'], 'units': ['a. u.']}) return h5_inds_new, h5_vals_new
def copy_dataset(h5_orig_dset, h5_dest_grp, alias=None, verbose=False): """ Copies the provided HDF5 dataset to the provided destination. This function is handy when needing to make copies of datasets to a different HDF5 file. Notes ----- This function does NOT copy all linked objects such as ancillary datasets. Call `copy_linked_objects` to accomplish that goal. Parameters ---------- h5_orig_dset : h5py.Dataset h5_dest_grp : h5py.Group or h5py.File object : Destination where the duplicate dataset will be created alias : str, optional. Default = name from `h5_orig_dset`: Name to be assigned to the copied dataset verbose : bool, optional. Default = False Whether or not to print logs to assist in debugging Returns ------- """ if not isinstance(h5_orig_dset, h5py.Dataset): raise TypeError("'h5_orig_dset' should be a h5py.Dataset object") if not isinstance(h5_dest_grp, (h5py.File, h5py.Group)): raise TypeError("'h5_dest_grp' should either be a h5py.File or " "h5py.Group object") if alias is not None: validate_single_string_arg(alias, 'alias') else: alias = h5_orig_dset.name.split('/')[-1] if alias in h5_dest_grp.keys(): if verbose: warn('{} already contains an object with the same name: {}' ''.format(h5_dest_grp, alias)) h5_new_dset = h5_dest_grp[alias] if not isinstance(h5_new_dset, h5py.Dataset): raise TypeError( '{} already contains an object: {} with the desired' ' name which is not a dataset'.format(h5_dest_grp, h5_new_dset)) da_source = lazy_load_array(h5_orig_dset) da_dest = lazy_load_array(h5_new_dset) if da_source.shape != da_dest.shape: raise ValueError('Existing dataset: {} has a different shape ' 'compared to the original dataset: {}' ''.format(h5_new_dset, h5_orig_dset)) if not da.allclose(da_source, da_dest): raise ValueError('Existing dataset: {} has different contents' 'compared to the original dataset: {}' ''.format(h5_new_dset, h5_orig_dset)) else: kwargs = { 'shape': h5_orig_dset.shape, 'dtype': h5_orig_dset.dtype, 'compression': h5_orig_dset.compression, 'chunks': h5_orig_dset.chunks } if h5_orig_dset.file.driver == 'mpio': if kwargs.pop('compression', None) is not None: warn('This HDF5 file has been opened wth the ' '"mpio" communicator. mpi4py does not allow ' 'creation of compressed datasets. Compression' ' kwarg has been removed') if verbose: print('Creating new HDF5 dataset named: {} at: {} with' ' kwargs: {}'.format(alias, h5_dest_grp, kwargs)) h5_new_dset = h5_dest_grp.create_dataset(alias, **kwargs) if verbose: print('dask.array will copy data from source dataset ' 'to new dataset') da.to_hdf5(h5_new_dset.file.filename, {h5_new_dset.name: lazy_load_array(h5_orig_dset)}) if verbose: print('Copying simple attributes of original dataset: {} to ' 'destination dataset: {}'.format(h5_orig_dset, h5_new_dset)) copy_attributes(h5_orig_dset, h5_new_dset, skip_refs=True) # TODO: reinstate copy all region_refs() # copy_all_region_refs(h5_orig_dset, h5_new_dset) return h5_new_dset
def check_for_old(h5_base, tool_name, new_parms=None, target_dset=None, h5_parent_goup=None, verbose=False): """ Check to see if the results of a tool already exist and if they were performed with the same parameters. Parameters ---------- h5_base : h5py.Dataset object Dataset on which the tool is being applied to tool_name : str process or analysis name new_parms : dict, optional Parameters with which this tool will be performed. target_dset : str, optional, default = None Name of the dataset whose attributes will be compared against new_parms. Default - checking against the group h5_parent_goup : h5py.Group, optional. Default = None The group to search under. Use this option when `h5_base` and the potential results groups (within `h5_parent_goup` are located in different HDF5 files. Default - search within h5_base.parent verbose : bool, optional, default = False Whether or not to print debugging statements Returns ------- group : list List of all :class:`h5py.Group` objects with parameters matching those in `new_parms` """ if not isinstance(h5_base, h5py.Dataset): raise TypeError('h5_base should be a h5py.Dataset object') tool_name = validate_single_string_arg(tool_name, 'tool_name') if h5_parent_goup is not None: if not isinstance(h5_parent_goup, (h5py.File, h5py.Group)): raise TypeError("'h5_parent_group' should either be a h5py.File " "or h5py.Group object") else: h5_parent_goup = h5_base.parent if new_parms is None: new_parms = dict() else: if not isinstance(new_parms, dict): raise TypeError('new_parms should be a dict') if target_dset is not None: target_dset = validate_single_string_arg(target_dset, 'target_dset') matching_groups = [] groups = find_results_groups(h5_base, tool_name, h5_parent_group=h5_parent_goup) for group in groups: if verbose: print('Looking at group - {}'.format(group.name.split('/')[-1])) h5_obj = group if target_dset is not None: if target_dset in group.keys(): h5_obj = group[target_dset] else: if verbose: print('{} did not contain the target dataset: {}'.format(group.name.split('/')[-1], target_dset)) continue if check_for_matching_attrs(h5_obj, new_parms=new_parms, verbose=verbose): # return group matching_groups.append(group) return matching_groups
def create_empty_dataset(source_dset, dtype, dset_name, h5_group=None, new_attrs=None, skip_refs=False): """ Creates an empty dataset in the h5 file based on the provided dataset in the same or specified group Parameters ---------- source_dset : h5py.Dataset object Source object that provides information on the group and shape of the dataset dtype : dtype Data type of the fit / guess datasets dset_name : String / Unicode Name of the dataset h5_group : :class:`h5py.Group`, optional. Default = None Group within which this dataset will be created new_attrs : dictionary (Optional) Any new attributes that need to be written to the dataset skip_refs : boolean, optional Should ObjectReferences be skipped when copying attributes from the `source_dset` Returns ------- h5_new_dset : h5py.Dataset object Newly created dataset """ if not isinstance(source_dset, h5py.Dataset): raise TypeError('source_deset should be a h5py.Dataset object') _ = validate_dtype(dtype) if new_attrs is not None: if not isinstance(new_attrs, dict): raise TypeError('new_attrs should be a dictionary') else: new_attrs = dict() if h5_group is None: h5_group = source_dset.parent else: if not isinstance(h5_group, (h5py.Group, h5py.File)): raise TypeError('h5_group should be a h5py.Group or h5py.File object') if source_dset.file != h5_group.file and not skip_refs: # Cannot carry over references warn('H5 object references will not be copied over since {} is in ' 'a different HDF5 file as {}'.format(h5_group, source_dset)) skip_refs = True dset_name = validate_single_string_arg(dset_name, 'dset_name') if '-' in dset_name: warn('dset_name should not contain the "-" character. Reformatted name from:{} to ' '{}'.format(dset_name, dset_name.replace('-', '_'))) dset_name = dset_name.replace('-', '_') kwargs = {'shape': source_dset.shape, 'dtype': dtype, 'compression': source_dset.compression, 'chunks': source_dset.chunks} if source_dset.file.driver == 'mpio': if kwargs.pop('compression', None) is not None: warn('This HDF5 file has been opened wth the "mpio" communicator. ' 'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed') if dset_name in h5_group.keys(): if isinstance(h5_group[dset_name], h5py.Dataset): warn('A dataset named: {} already exists in group: {}'.format(dset_name, h5_group.name)) h5_new_dset = h5_group[dset_name] # Make sure it has the correct shape and dtype if any((source_dset.shape != h5_new_dset.shape, dtype != h5_new_dset.dtype)): warn('Either the shape (existing: {} desired: {}) or dtype (existing: {} desired: {}) of the dataset ' 'did not match with expectations. Deleting and creating a new one.'.format(h5_new_dset.shape, source_dset.shape, h5_new_dset.dtype, dtype)) del h5_new_dset, h5_group[dset_name] h5_new_dset = h5_group.create_dataset(dset_name, **kwargs) else: raise KeyError('{} is already a {} in group: {}'.format(dset_name, type(h5_group[dset_name]), h5_group.name)) else: h5_new_dset = h5_group.create_dataset(dset_name, **kwargs) # This should link the ancillary datasets correctly h5_new_dset = hut.copy_attributes(source_dset, h5_new_dset, skip_refs=skip_refs) if source_dset.file != h5_group.file: hut.copy_linked_objects(source_dset, h5_new_dset) h5_new_dset.attrs.update(new_attrs) if check_if_main(h5_new_dset): from ..usi_data import USIDataset h5_new_dset = USIDataset(h5_new_dset) # update book keeping attributes write_book_keeping_attrs(h5_new_dset) return h5_new_dset
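# --- Illustrative usage sketch for create_empty_dataset() above: creates an
# empty float32 dataset shaped like an existing one, e.g. to hold fit
# results. Names are hypothetical; assumes h5py, numpy and the companion
# helpers used inside the function (validate_dtype, copy_attributes,
# check_if_main, write_book_keeping_attrs) are importable.
import h5py
import numpy as np

with h5py.File('empty_dset_example.h5', 'w') as h5_f:
    h5_source = h5_f.create_dataset('Raw_Data', data=np.random.rand(16, 128))
    h5_fit = create_empty_dataset(h5_source, np.float32, 'Fit')
    print(h5_fit.shape, h5_fit.dtype)  # expected: (16, 128) float32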