def get_aux_dset_slicing(dim_names, last_ind=None, is_spectroscopic=False):
    """
    Returns a dictionary of slice objects to help in creating region references
    in the position or spectroscopic indices and values datasets

    Parameters
    ------------
    dim_names : iterable
        List of strings denoting the names of the position axes or spectroscopic
        dimensions arranged in the same order that matches the dimensions in the
        indices / values dataset
    last_ind : (Optional) unsigned int, default = None
        Last pixel in the position or spectroscopic matrix. Useful in experiments
        where the parameters have changed (eg. BEPS new data format) during the
        experiment.
    is_spectroscopic : bool, optional. default = False
        Set to True for spectroscopic datasets and leave as False (default) for
        position datasets. Controls which axis of the ancillary matrix the
        dimension index selects (see comment in the loop below).

    Returns
    ------------
    slice_dict : dictionary
        Dictionary of tuples containing slice objects corresponding to
        each position axis.
    """
    dim_names = validate_list_of_strings(dim_names, 'dim_names')
    if len(dim_names) == 0:
        raise ValueError('No valid dim_names provided')

    slice_dict = dict()
    for spat_ind, curr_dim_name in enumerate(dim_names):
        # A position matrix is (N_points x N_dims): limit rows by last_ind and
        # take the one column for this dimension. A spectroscopic matrix is the
        # transpose (N_dims x N_points), hence the slice tuple is reversed.
        val = (slice(last_ind), slice(spat_ind, spat_ind + 1))
        if is_spectroscopic:
            val = val[::-1]
        slice_dict[str(curr_dim_name)] = val
    return slice_dict
def can_read(self, *args, **kwargs):
    """
    Checks whether the provided file can be read by this reader.

    This basic function compares the file extension against the ``extension``
    keyword argument. If the extension matches, this function returns True

    Parameters
    ----------
    extension : str or iterable of str, Optional. Default = None
        File extension for the input file.

    Returns
    -------
    file_path : str
        Path to the file that needs to be provided to read()
        if the provided file was indeed a valid file
        Else, None

    Raises
    ------
    NotImplementedError : if this function is called for this or a child class
        that does not provide the ``extension`` keyword argument

    Notes
    -----
    It is recommended to add additional checks as necessary to ensure that
    the translator can indeed read the given file such as by validating the
    headers or similar metadata.
    """
    targ_ext = kwargs.get('extension', None)
    if not targ_ext:
        raise NotImplementedError('Either can_read() has not been '
                                  'implemented by this Reader or the '
                                  '"extension" keyword argument was '
                                  'missing')
    if isinstance(targ_ext, (str, unicode)):
        targ_ext = [targ_ext]
    targ_ext = validate_list_of_strings(targ_ext,
                                        parm_name='(keyword argument) '
                                                  '"extension"')

    # Normalize targets: drop any '.' separators and lower-case so the
    # comparison below is case-insensitive
    wanted = {entry.replace('.', '').lower() for entry in targ_ext}

    abs_path = os.path.abspath(self._input_file_path)
    # Extension of the input file, without the leading '.', lower-cased to
    # match the normalized targets
    this_ext = os.path.splitext(abs_path)[1][1:].lower()

    return abs_path if this_ext in wanted else None
def is_valid_file(file_path, *args, **kwargs):
    """
    Checks whether the provided file can be read by this translator.

    This basic function compares the file extension against the "extension"
    keyword argument. If the extension matches, this function returns True

    Parameters
    ----------
    file_path : str
        Path to raw data file

    Returns
    -------
    file_path : str
        Path to the file that needs to be provided to translate()
        if the provided file was indeed a valid file
        Else, None
    """
    file_path = validate_single_string_arg(file_path, 'file_name')

    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path + ' does not exist')

    targ_ext = kwargs.get('extension', None)
    if not targ_ext:
        raise NotImplementedError('Either is_valid_file() has not been '
                                  'implemented by this translator or the '
                                  '"extension" keyword argument was '
                                  'missing')
    if isinstance(targ_ext, (str, unicode)):
        targ_ext = [targ_ext]
    targ_ext = validate_list_of_strings(targ_ext,
                                        parm_name='(keyword argument) '
                                                  '"extension"')

    # Build a normalized set of acceptable extensions: no '.' separators,
    # lower case for case-insensitive matching
    wanted = {entry.replace('.', '').lower() for entry in targ_ext}

    abs_path = os.path.abspath(file_path)
    # Extension of the candidate file without the leading '.'
    actual_ext = os.path.splitext(abs_path)[1][1:].lower()

    return abs_path if actual_ext in wanted else None
def get_attributes(h5_object, attr_names=None, strict=False):
    """
    Returns attribute associated with some DataSet.

    Parameters
    ----------
    h5_object : :class:`h5py.Dataset`
        Dataset object reference.
    attr_names : str or :class:`list` of str, optional. Default = all
        Name of attribute object to return.
    strict : bool, optional. Default = False
        If True - raises a KeyError if desired keys are not found.
        Else, raises warning instead.
        This is especially useful when attempting to read attributes with
        invalid names such as spaces on either sides of text.

    Returns
    -------
    att_dict : dict
        Dictionary containing (name,value) pairs of attributes
    """
    if not isinstance(h5_object, (h5py.Dataset, h5py.Group, h5py.File)):
        raise TypeError(
            'h5_object should be a h5py.Dataset, h5py.Group or h5py.File object'
        )

    if attr_names is None:
        attr_names = h5_object.attrs.keys()
    else:
        attr_names = validate_list_of_strings(attr_names, 'attr_names')
        # The caller asked for specific attributes, so a missing one is
        # always an error regardless of the strict flag passed in
        strict = True

    att_dict = {}
    for name in attr_names:
        try:
            att_dict[name] = get_attr(h5_object, name)
        except KeyError:
            mesg = '"{}" is not an attribute of {}'.format(
                name, h5_object.name)
            if not strict:
                warn(mesg)
            else:
                raise KeyError(mesg)

    return att_dict
def get_auxiliary_datasets(h5_object, aux_dset_name=None):
    """
    Returns auxiliary dataset objects associated with some DataSet through its
    attributes. Note - region references will be ignored.

    Parameters
    ----------
    h5_object : :class:`h5py.Dataset`, :class:`h5py.Group` or :class:`h5py.File`
        Dataset object reference.
    aux_dset_name : str or :class:`list` of str, optional. Default = all
        Name of auxiliary :class:`h5py.Dataset` objects to return.

    Returns
    -------
    list of :class:`h5py.Reference` of auxiliary :class:`h5py.Dataset` objects.
    """
    if not isinstance(h5_object, (h5py.Dataset, h5py.Group, h5py.File)):
        raise TypeError(
            'h5_object should be a h5py.Dataset, h5py.Group or h5py.File object'
        )

    if aux_dset_name is None:
        aux_dset_name = h5_object.attrs.keys()
    else:
        aux_dset_name = validate_list_of_strings(aux_dset_name,
                                                 'aux_dset_name')

    data_list = list()
    curr_name = None
    try:
        h5_file = h5_object.file
        for curr_name in aux_dset_name:
            h5_ref = h5_object.attrs[curr_name]
            # Only collect plain object references (not region references)
            # that actually point at datasets in this file
            plain_ref = (isinstance(h5_ref, h5py.Reference) and
                         not isinstance(h5_ref, h5py.RegionReference))
            if plain_ref and isinstance(h5_file[h5_ref], h5py.Dataset):
                data_list.append(h5_file[h5_ref])
    except KeyError:
        raise KeyError('%s is not an attribute of %s' % (str(curr_name),
                                                         h5_object.name))

    return data_list
def get_h5_obj_refs(obj_names, h5_refs):
    """
    Given a list of H5 references and a list of names, this method returns H5
    objects corresponding to the names

    Parameters
    ----------
    obj_names : string or List of strings
        names of target h5py objects
    h5_refs : H5 object reference or List of H5 object references
        list containing the target reference

    Returns
    -------
    found_objects : List of HDF5 dataset references
        Corresponding references
    """
    # BUG FIX: the parameter name reported by validation errors used to be
    # 'attr_names', which does not exist on this function
    obj_names = validate_list_of_strings(obj_names, 'obj_names')

    if isinstance(h5_refs, (h5py.File, h5py.Group, h5py.Dataset)):
        h5_refs = [h5_refs]
    if not isinstance(h5_refs, (list, tuple)):
        raise TypeError(
            'h5_refs should be a / list of h5py.Dataset, h5py.Group or h5py.File object(s)'
        )

    found_objects = []
    for target_name in obj_names:
        for h5_object in h5_refs:
            # Silently skip entries that are not valid h5py objects
            if not isinstance(h5_object,
                              (h5py.File, h5py.Group, h5py.Dataset)):
                continue
            # Match on the last component of the object's absolute HDF5 path
            if h5_object.name.split('/')[-1] == target_name:
                found_objects.append(h5_object)

    return found_objects
def write_reduced_anc_dsets(h5_parent_group, h5_inds, h5_vals, dim_name, basename=None, is_spec=None, verbose=False):
    """
    Creates new Ancillary Indices and Values datasets from the input datasets
    by dropping the specified dimensions

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or h5py.File
        Group under which the indices and values datasets will be created
    h5_inds : HDF5 Dataset
            Spectroscopic or Positions indices dataset
    h5_vals : HDF5 Dataset
            Spectroscopic or Positions values dataset
    dim_name : str or unicode or list of strings
            Names of the dimension(s) to remove
    basename : str or unicode, Optional
            String to which '_Indices' and '_Values' will be appended to get
            the names of the new datasets. Default = 'Position' or
            'Spectroscopic'
    is_spec : bool, optional
            Whether or not the provided ancillary datasets are position or
            spectroscopic. The user is recommended to supply this parameter
            whenever it is known or possible. By default, this function will
            attempt to recognize the answer based on the shape of the datasets.
    verbose : bool, optional. Default = False
            Whether or not to print debugging print statements

    Returns
    -------
    h5_inds_new : h5py.Dataset
            Reduced indices dataset
    h5_vals_new : h5py.Dataset
            Reduces values dataset
    """
    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_parent_group should either be a h5py. Group or File object')

    for param, param_name in zip([h5_inds, h5_vals], ['h5_inds', 'h5_vals']):
        if not isinstance(param, h5py.Dataset):
            raise TypeError(param_name + ' should be a h5py.Dataset object')

    # NOTE(review): dim_name is only validated when not None, yet the loop
    # and set() comparison below dereference it unconditionally — a None
    # dim_name would raise TypeError here. Confirm intended contract.
    if dim_name is not None:
        dim_name = validate_list_of_strings(dim_name, 'dim_name')

    all_dim_names = list(get_attr(h5_inds, 'labels'))
    for item in dim_name:
        if item not in all_dim_names:
            raise KeyError('Requested dimension: {} not in the list of labels: {}'.format(item, all_dim_names))

    # Pull both ancillary matrices fully into memory
    ind_mat = h5_inds[()]
    val_mat = h5_vals[()]

    if is_spec is None:
        # Attempt to recognize the type automatically:
        # spectroscopic matrices are wide (dims x steps), position matrices
        # are tall (points x dims). A square matrix is ambiguous.
        is_spec = False
        if ind_mat.shape[0] == ind_mat.shape[1]:
            raise ValueError('Unable automatically guess whether the provided datasets are position or '
                             'spectroscopic. Please explicitely specify via the "is_spec" boolean kwarg')
        if ind_mat.shape[0] < ind_mat.shape[1]:
            is_spec = True
    else:
        if not isinstance(is_spec, bool):
            raise TypeError('is_spec should be a boolean. Provided object is of type: {}'.format(type(is_spec)))

    if basename is not None:
        basename = validate_single_string_arg(basename, 'basename')
        # Strip a single trailing underscore so '_Indices' / '_Values'
        # suffixes do not produce a double underscore
        if basename.endswith('_'):
            basename = basename[:-1]
    else:
        if is_spec:
            basename = 'Spectroscopic'
        else:
            basename = 'Position'

    # Refuse to overwrite existing datasets with the target names
    for sub_name in ['_Indices', '_Values']:
        if basename + sub_name in h5_parent_group.keys():
            raise KeyError('Dataset: {} already exists in provided group: {}'.format(basename + sub_name,
                                                                                     h5_parent_group.name))

    if set(dim_name) != set(all_dim_names):
        # At least one dimension will remain
        if verbose:
            print('All Dimensions: {}. Dimensions to be removed: {}'.format(all_dim_names, dim_name))

        if not is_spec:
            # Convert to spectral shape so the logic below can always work
            # row-wise (one dimension per row)
            ind_mat = np.transpose(ind_mat)
            val_mat = np.transpose(val_mat)

        # For all dimensions, find where the index = 0
        # basically, we are indexing all dimensions to 0
        first_indices = []
        keep_dim = np.ones(len(all_dim_names), dtype=bool)
        for cur_dim in dim_name:
            dim_ind = all_dim_names.index(cur_dim)
            keep_dim[dim_ind] = False
            # check equality against the minimum value instead of 0 to account for cases when a dimension does not
            # start from 0 (already been sliced) - think of multi-dimensional slicing!
            first_indices.append(ind_mat[dim_ind] == np.min(ind_mat[dim_ind]))
        first_indices = np.vstack(first_indices)

        if verbose:
            print('Raw first_indices:')
            print(first_indices)
            print('Dimensions to keep: {}'.format(keep_dim))

        # Keep only the columns where every removed dimension sits at its
        # minimum value simultaneously
        step_starts = np.all(first_indices, axis=0)

        if verbose:
            print('Columns in dataset to keep:')
            print(step_starts)

        '''
        Extract all rows that we want to keep from input indices and values
        '''
        # TODO: handle TypeError: Indexing elements must be in increasing order
        ind_mat = ind_mat[keep_dim, :][:, step_starts]
        val_mat = val_mat[keep_dim, :][:, step_starts]

        if not is_spec:
            # Convert back to position shape
            ind_mat = np.transpose(ind_mat)
            val_mat = np.transpose(val_mat)

        '''
        Create new Datasets to hold the data
        Name them based on basename
        '''
        h5_inds_new = h5_parent_group.create_dataset(basename + '_Indices', data=ind_mat, dtype=h5_inds.dtype)
        h5_vals_new = h5_parent_group.create_dataset(basename + '_Values', data=val_mat, dtype=h5_vals.dtype)

        # Extracting the labels from the original spectroscopic data sets
        labels = h5_inds.attrs['labels'][keep_dim]
        # Creating the dimension slices for the new spectroscopic data sets

        # Adding the labels and units to the new spectroscopic data sets
        for dset in [h5_inds_new, h5_vals_new]:
            write_simple_attrs(dset, {'labels': labels, 'units': h5_inds.attrs['units'][keep_dim]})
    else:
        # Remove all dimensions: collapse to a single 1x1 placeholder step
        # using the module-level INDICES_DTYPE / VALUES_DTYPE constants
        h5_inds_new = h5_parent_group.create_dataset(basename + '_Indices', data=np.array([[0]]), dtype=INDICES_DTYPE)
        h5_vals_new = h5_parent_group.create_dataset(basename + '_Values', data=np.array([[0]]), dtype=VALUES_DTYPE)

        for dset in [h5_inds_new, h5_vals_new]:
            write_simple_attrs(dset, {'labels': ['Single_Step'], 'units': ['a. u.']})

    return h5_inds_new, h5_vals_new
def get_unit_values(ds_inds, ds_vals, dim_names=None, all_dim_names=None, is_spec=None, verbose=False):
    """
    Gets the unit arrays of values that describe the spectroscopic dimensions

    Parameters
    ----------
    ds_inds : h5py.Dataset or numpy.ndarray
        Spectroscopic or Position Indices dataset
    ds_vals : h5py.Dataset or numpy.ndarray
        Spectroscopic or Position Values dataset
    dim_names : str, or list of str, Optional
        Names of the dimensions of interest. Default = all
    all_dim_names : list of str, Optional
        Names of all the dimensions in these datasets. Use this if supplying
        numpy arrays instead of h5py.Dataset objects for h5_inds, h5_vals since
        there is no other way of getting the dimension names.
    is_spec : bool, optional
        Whether or not the provided ancillary datasets are position or
        spectroscopic. The user is recommended to supply this parameter
        whenever it is known. By default, this function will attempt to
        recognize the answer based on the shape of the datasets.
    verbose : bool, optional
        Whether or not to print debugging statements. Default - off

    Note - this function can be extended / modified for ancillary position
    dimensions as well

    Returns
    -------
    unit_values : dict
        Dictionary containing the unit array for each dimension. The name of
        the dimensions are the keys.
    """
    if all_dim_names is None:
        # Dimension names must come from the attributes, so only h5py
        # datasets are acceptable inputs in this case
        allowed_types = h5py.Dataset
    else:
        all_dim_names = validate_list_of_strings(all_dim_names, 'all_dim_names')
        all_dim_names = np.array(all_dim_names)
        allowed_types = (h5py.Dataset, np.ndarray)

    for dset, dset_name in zip([ds_inds, ds_vals], ['ds_inds', 'ds_vals']):
        if not isinstance(dset, allowed_types):
            raise TypeError(dset_name + ' should be of type: {}'.format(allowed_types))

    # For now, we will throw an error if even a single dimension is listed as an incomplete dimension:
    if isinstance(ds_inds, h5py.Dataset):
        if np.any(['incomplete_dimensions' in dset.attrs.keys() for dset in [ds_inds, ds_vals]]):
            try:
                incomp_dims_inds = get_attr(ds_inds, 'incomplete_dimensions')
            except KeyError:
                incomp_dims_inds = None
            try:
                incomp_dims_vals = get_attr(ds_vals, 'incomplete_dimensions')
            except KeyError:
                incomp_dims_vals = None
            if incomp_dims_inds is None and incomp_dims_vals is not None:
                incomp_dims = incomp_dims_vals
            elif incomp_dims_inds is not None and incomp_dims_vals is None:
                incomp_dims = incomp_dims_inds
            else:
                # ensure that both attributes are the same
                if incomp_dims_vals != incomp_dims_inds:
                    raise ValueError(
                        'Provided indices ({}) and values ({}) datasets were marked with different values '
                        'for incomplete_datasets.'.format(incomp_dims_inds, incomp_dims_vals))
                incomp_dims = incomp_dims_vals
            all_dim_names = get_attr(ds_inds, 'labels')
            raise ValueError(
                'Among all dimensions: {}, These dimensions were marked as incomplete dimensions: {}'
                '. You are recommended to find unit values manually'.format(all_dim_names, incomp_dims))

    # Do we need to check that the provided inds and vals correspond to the same main dataset?
    if ds_inds.shape != ds_vals.shape:
        raise ValueError('h5_inds: {} and h5_vals: {} should have the same shapes'.format(ds_inds.shape,
                                                                                          ds_vals.shape))

    if all_dim_names is None:
        all_dim_names = get_attr(ds_inds, 'labels')
    if verbose:
        print('All dimensions: {}'.format(all_dim_names))

    # First load to memory
    inds_mat = ds_inds[()]
    vals_mat = ds_vals[()]

    if is_spec is None:
        # Attempt to recognize the type automatically: spectroscopic
        # matrices are wide (dims x steps)
        is_spec = False
        if inds_mat.shape[0] < inds_mat.shape[1]:
            is_spec = True
    else:
        if not isinstance(is_spec, bool):
            raise TypeError('is_spec should be a boolean. Provided object is of type: {}'.format(type(is_spec)))

    if verbose:
        print('Ancillary matrices of shape: {}, hence determined to be Spectroscopic:{}'.format(inds_mat.shape,
                                                                                                is_spec))

    if not is_spec:
        # Convert to spectral shape so the per-dimension logic below can
        # always treat rows as dimensions
        inds_mat = np.transpose(inds_mat)
        vals_mat = np.transpose(vals_mat)

    if len(all_dim_names) != inds_mat.shape[0]:
        raise ValueError('Length of dimension names list: {} not matching with shape of dataset: {}'
                         '.'.format(len(all_dim_names), inds_mat.shape[0]))

    if dim_names is None:
        dim_names = all_dim_names
        if verbose:
            print('Going to return unit values for all dimensions: {}'.format(all_dim_names))
    else:
        dim_names = validate_list_of_strings(dim_names, 'dim_names')
        if verbose:
            print('Checking to make sure that the target dimension names: {} exist in the datasets attributes: {}'
                  '.'.format(dim_names, all_dim_names))
        # check to make sure that the dimension names exist in the datasets:
        for dim_name in dim_names:
            if dim_name not in all_dim_names:
                raise KeyError('Dimension {} does not exist in the provided ancillary datasets'.format(dim_name))

    unit_values = dict()
    # Every dimension is analyzed (not just the requested ones) so that
    # irregular dimensions are caught regardless of what the caller asked for
    for dim_name in all_dim_names:
        # Find the row in the spectroscopic indices that corresponds to the dimensions we want to slice:
        if verbose:
            print('Looking for dimension: {} in {}'.format(dim_name, dim_names))
        desired_row_ind = np.where(all_dim_names == dim_name)[0][0]

        inds_for_dim = inds_mat[desired_row_ind]

        # Wherever this dimension goes to 0 - start of a new tile
        starts = np.where(inds_for_dim == np.min(inds_for_dim))[0]
        if starts[0] != 0:
            raise ValueError('Spectroscopic Indices for dimension: "{}" not '
                             'starting with 0. Please fix this and try again'
                             '.'.format(dim_name))

        # There may be repetitions in addition to tiling. Find how the the positions increase.
        # 1 = repetition, > 1 = new tile
        step_sizes = np.hstack(([1], np.diff(starts)))
        # This array is of the same length as the full indices array

        # We should expect only two values of step sizes for a regular dimension (tiles of the same size):
        # 1 for same value repeating and a big jump in indices when the next tile starts
        # If the repeats / tiles are of different lengths, then this is not a regular dimension.
        # What does a Unit Values vector even mean in this case? Just raise an error for now
        if np.where(np.unique(step_sizes) - 1)[0].size > 1:
            raise ValueError('Non constant step sizes')

        # Finding Start of a new tile
        tile_starts = np.where(step_sizes > 1)[0]

        # converting these indices to correct indices that can be mapped straight to
        if len(tile_starts) < 1:
            # Dimension(s) with no tiling at all
            # Make it look as though the next tile starts at the end of the whole indices vector
            tile_starts = np.array([0, len(inds_for_dim)])
        else:
            # Dimension with some form of repetition
            tile_starts = np.hstack(([0], starts[tile_starts]))
            # Verify that each tile is identical here
            # Last tile will not be checked unless we add the length of the indices vector as the start of next tile
            tile_starts = np.hstack((tile_starts, [len(inds_for_dim)]))
            subsections = [inds_for_dim[tile_starts[ind]:tile_starts[ind + 1]]
                           for ind in range(len(tile_starts) - 1)]
            if np.max(np.diff(subsections, axis=0)) != 0:
                # Should get unit values for ALL dimensions regardless of expectations to catch such scenarios.
                raise ValueError('Values in each tile of dimension: {} are different'.format(dim_name))

        # Now looking within the first tile:
        subsection = inds_for_dim[tile_starts[0]:tile_starts[1]]
        # remove all repetitions. ie - take indices only where jump == 1
        step_inds = np.hstack(([0], np.where(np.hstack(([0], np.diff(subsection))))[0]))

        # Finally, use these indices to get the values
        if dim_name in dim_names:
            # Only add this dimension to dictionary if requested.
            unit_values[dim_name] = vals_mat[desired_row_ind, step_inds]

    return unit_values