def setUp(self):
    data_utils.make_beps_file()
    self.orig_labels_order = ['X', 'Y', 'Cycle', 'Bias']
    self.h5_file = h5py.File(data_utils.std_beps_path, mode='r')

    h5_grp = self.h5_file['/Raw_Measurement/']
    self.source_nd_s2f = h5_grp['n_dim_form'][()]
    self.source_nd_f2s = self.source_nd_s2f.transpose(1, 0, 3, 2)
    self.h5_source = USIDataset(h5_grp['source_main'])

    self.pos_dims = []
    self.spec_dims = []

    for dim_name, dim_units in zip(self.h5_source.pos_dim_labels,
                                   get_attr(self.h5_source.h5_pos_inds,
                                            'units')):
        self.pos_dims.append(Dimension(dim_name, dim_units,
                                       h5_grp[dim_name][()]))

    for dim_name, dim_units in zip(self.h5_source.spec_dim_labels,
                                   get_attr(self.h5_source.h5_spec_inds,
                                            'units')):
        self.spec_dims.append(Dimension(dim_name, dim_units,
                                        h5_grp[dim_name][()]))

    res_grp_0 = h5_grp['source_main-Fitter_000']
    self.results_0_nd_s2f = res_grp_0['n_dim_form'][()]
    self.results_0_nd_f2s = self.results_0_nd_s2f.transpose(1, 0, 3, 2)
    self.h5_compound = USIDataset(res_grp_0['results_main'])

    res_grp_1 = h5_grp['source_main-Fitter_001']
    self.results_1_nd_s2f = res_grp_1['n_dim_form'][()]
    self.results_1_nd_f2s = self.results_1_nd_s2f.transpose(1, 0, 3, 2)
    self.h5_complex = USIDataset(res_grp_1['results_main'])
def validate_anc_dset_attrs(h5_inds, h5_vals, is_spec=True):
    """
    Validates the attributes of a pair of indices and values datasets.
    Throws ValueErrors if any rule is not satisfied

    Parameters
    ----------
    h5_inds : h5py.Dataset
        Indices dataset
    h5_vals : h5py.Dataset
        Values dataset
    is_spec : bool, optional. Default = True
        Set to True if spectroscopic. Else - Position datasets
    """
    def lists_match(left, right):
        if len(left) != len(right):
            return False
        return all([l_it == r_it for l_it, r_it in zip(left, right)])

    v_names = get_attr(h5_vals, 'labels')
    v_units = get_attr(h5_vals, 'units')
    i_names = get_attr(h5_inds, 'labels')
    i_units = get_attr(h5_inds, 'units')

    for names, units, dset_type in zip([v_names, i_names],
                                       [v_units, i_units],
                                       ['Values', 'Indices']):
        if len(names) != len(units):
            raise ValueError('Length of labels: {} and units: {} for the {} '
                             'dataset do not match'
                             ''.format(len(names), len(units), dset_type))

    for i_item, v_item, prop in zip([i_names, i_units],
                                    [v_names, v_units],
                                    ['labels', 'units']):
        if not lists_match(i_item, v_item):
            raise ValueError('The "{}" values of the Indices: {} and Values: '
                             '{} datasets do not match'.format(prop, i_item,
                                                               v_item))

    # Now check the rows / cols nums against size of any attr:
    if h5_inds.shape != h5_vals.shape:
        raise ValueError('Shape of Indices: {} and Values: {} datasets do '
                         'not match'.format(h5_inds.shape, h5_vals.shape))

    dim_ind = 1
    if is_spec:
        dim_ind = 0

    if h5_inds.shape[dim_ind] != len(v_names):
        raise ValueError('Length of mandatory attributes: {} did not match '
                         'dimension: {} of the ancillary dataset of shape: {}'
                         ''.format(len(v_names), dim_ind, h5_inds.shape))
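# Usage sketch (illustrative; the file and group paths below are
# hypothetical): validate a Position indices / values pair and report any
# rule violation.
#
# >>> with h5py.File('data.h5', mode='r') as h5_f:
# ...     grp = h5_f['/Raw_Measurement']
# ...     try:
# ...         validate_anc_dset_attrs(grp['Position_Indices'],
# ...                                 grp['Position_Values'], is_spec=False)
# ...     except ValueError as err:
# ...         print('Ancillary pair failed validation:', err)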
def test_write_reg_ref_main_one_dim(self):
    file_path = 'test.h5'
    data_utils.delete_existing_file(file_path)
    data = np.random.rand(7)
    with h5py.File(file_path, mode='w') as h5_f:
        h5_dset = h5_f.create_dataset('Main', data=data)
        reg_refs = {'even_rows': (slice(0, None, 2)),
                    'odd_rows': (slice(1, None, 2))}
        reg_ref.write_region_references(h5_dset, reg_refs,
                                        add_labels_attr=True)
        self.assertEqual(len(h5_dset.attrs), 1 + len(reg_refs))

        actual = get_attr(h5_dset, 'labels')
        self.assertTrue(np.all([x == y for x, y in
                                zip(actual, ['even_rows', 'odd_rows'])]))

        expected_data = [data[0:None:2], data[1:None:2]]
        written_data = [h5_dset[h5_dset.attrs['even_rows']],
                        h5_dset[h5_dset.attrs['odd_rows']]]

        for exp, act in zip(expected_data, written_data):
            self.assertTrue(np.allclose(exp, act))

    os.remove(file_path)
def test_string_representation(self):
    usi_dset = self.h5_source
    h5_main = self.h5_file[usi_dset.name]
    actual = usi_dset.__repr__()
    actual = [line.strip() for line in actual.split("\n")]
    actual = [actual[line_ind] for line_ind in [0, 2, 4, 7, 8, 10, 11]]

    expected = list()
    expected.append(h5_main.__repr__())
    expected.append(h5_main.name)
    expected.append(get_attr(h5_main, "quantity") + " (" +
                    get_attr(h5_main, "units") + ")")
    for h5_inds in [usi_dset.h5_pos_inds, usi_dset.h5_spec_inds]:
        for dim_name, dim_size in zip(get_attr(h5_inds, "labels"),
                                      get_dimensionality(h5_inds)):
            expected.append(dim_name + ' - size: ' + str(dim_size))
    self.assertTrue(np.all([x == y for x, y in zip(actual, expected)]))
def test_simple_region_ref_copy(self):
    # based on test_hdf_writer.test_write_legal_reg_ref_multi_dim_data()
    file_path = 'test.h5'
    data_utils.delete_existing_file(file_path)
    with h5py.File(file_path, mode='w') as h5_f:
        data = np.random.rand(5, 7)
        h5_orig_dset = h5_f.create_dataset('test', data=data)
        self.assertIsInstance(h5_orig_dset, h5py.Dataset)

        attrs = {'labels': {'even_rows': (slice(0, None, 2), slice(None)),
                            'odd_rows': (slice(1, None, 2), slice(None))}}

        data_utils.write_main_reg_refs(h5_orig_dset, attrs['labels'])
        h5_f.flush()

        # two attrs point to region references. one for labels
        self.assertEqual(len(h5_orig_dset.attrs), 1 + len(attrs['labels']))

        # check if the labels attribute was written:
        self.assertTrue(np.all([x in list(attrs['labels'].keys())
                                for x in get_attr(h5_orig_dset, 'labels')]))

        expected_data = [data[:None:2], data[1:None:2]]
        written_data = [h5_orig_dset[h5_orig_dset.attrs['even_rows']],
                        h5_orig_dset[h5_orig_dset.attrs['odd_rows']]]

        for exp, act in zip(expected_data, written_data):
            self.assertTrue(np.allclose(exp, act))

        # Now write a new dataset without the region reference:
        h5_new_dset = h5_f.create_dataset('other', data=data)
        self.assertIsInstance(h5_new_dset, h5py.Dataset)
        h5_f.flush()

        for key in attrs['labels'].keys():
            reg_ref.simple_region_ref_copy(h5_orig_dset, h5_new_dset, key)

        # now check to make sure that this dataset also has the same region
        # references:
        written_data = [h5_new_dset[h5_new_dset.attrs['even_rows']],
                        h5_new_dset[h5_new_dset.attrs['odd_rows']]]

        for exp, act in zip(expected_data, written_data):
            self.assertTrue(np.allclose(exp, act))

    os.remove(file_path)
def get_all_dimensions():
    pos_dims = []
    spec_dims = []
    with h5py.File(test_h5_file_path, mode='r') as h5_f:
        h5_raw_grp = h5_f['Raw_Measurement']
        usi_main = USIDataset(h5_raw_grp['source_main'])

        for dim_name, dim_units in zip(usi_main.pos_dim_labels,
                                       get_attr(usi_main.h5_pos_inds,
                                                'units')):
            pos_dims.append(Dimension(dim_name, dim_units,
                                      h5_raw_grp[dim_name][()]))

        for dim_name, dim_units in zip(usi_main.spec_dim_labels,
                                       get_attr(usi_main.h5_spec_inds,
                                                'units')):
            spec_dims.append(Dimension(dim_name, dim_units,
                                       h5_raw_grp[dim_name][()]))

    return pos_dims, spec_dims
def test_get_indices_for_region_ref_corners(self):
    with h5py.File(data_utils.std_beps_path, mode='r') as h5_f:
        h5_main = h5_f['/Raw_Measurement/source_main']
        ref_in = get_attr(h5_main, 'even_rows')
        ret_val = reg_ref.get_indices_for_region_ref(h5_main, ref_in,
                                                     'corners')
        expected_pos = np.repeat(np.arange(h5_main.shape[0])[::2], 2)
        expected_spec = np.tile(np.array([0, h5_main.shape[1] - 1]),
                                expected_pos.size // 2)
        expected_corners = np.vstack((expected_pos, expected_spec)).T
        self.assertTrue(np.allclose(ret_val, expected_corners))
def test_get_indices_for_region_ref_slices(self):
    with h5py.File(data_utils.std_beps_path, mode='r') as h5_f:
        h5_main = h5_f['/Raw_Measurement/source_main']
        ref_in = get_attr(h5_main, 'even_rows')
        ret_val = reg_ref.get_indices_for_region_ref(h5_main, ref_in,
                                                     'slices')
        spec_slice = slice(0, h5_main.shape[1] - 1, None)
        expected_slices = np.array([[slice(x, x, None), spec_slice]
                                    for x in
                                    np.arange(h5_main.shape[0])[::2]])
        self.assertTrue(np.all(ret_val == expected_slices))
def _write_results_chunk(self):
    """
    Writes the provided SVD results to file
    """
    comp_dim = Dimension('Principal Component', 'a. u.', len(self.__s))

    h5_svd_group = create_results_group(
        self.h5_main, self.process_name,
        h5_parent_group=self._h5_target_group)
    self.h5_results_grp = h5_svd_group
    self._write_source_dset_provenance()

    write_simple_attrs(h5_svd_group, self.parms_dict)
    write_simple_attrs(h5_svd_group, {'svd_method': 'sklearn-randomized'})

    h5_u = write_main_dataset(h5_svd_group, np.float32(self.__u), 'U',
                              'Abundance', 'a.u.', None, comp_dim,
                              h5_pos_inds=self.h5_main.h5_pos_inds,
                              h5_pos_vals=self.h5_main.h5_pos_vals,
                              dtype=np.float32,
                              chunks=calc_chunks(self.__u.shape,
                                                 np.float32(0).itemsize))
    # print(get_attr(self.h5_main, 'quantity')[0])
    h5_v = write_main_dataset(h5_svd_group, self.__v, 'V',
                              get_attr(self.h5_main, 'quantity')[0],
                              'a.u.', comp_dim, None,
                              h5_spec_inds=self.h5_main.h5_spec_inds,
                              h5_spec_vals=self.h5_main.h5_spec_vals,
                              chunks=calc_chunks(
                                  self.__v.shape,
                                  self.h5_main.dtype.itemsize))

    # No point making this 1D dataset a main dataset
    h5_s = h5_svd_group.create_dataset('S', data=np.float32(self.__s))

    '''
    Check h5_main for plot group references.
    Copy them into V if they exist
    '''
    for key in self.h5_main.attrs.keys():
        if '_Plot_Group' not in key:
            continue
        ref_inds = get_indices_for_region_ref(self.h5_main,
                                              self.h5_main.attrs[key],
                                              return_method='corners')
        ref_inds = ref_inds.reshape([-1, 2, 2])
        ref_inds[:, 1, 0] = h5_v.shape[0] - 1

        svd_ref = create_region_reference(h5_v, ref_inds)

        h5_v.attrs[key] = svd_ref

    # Marking completion:
    self._status_dset_name = 'completed_positions'
    self._h5_status_dset = h5_svd_group.create_dataset(
        self._status_dset_name,
        data=np.ones(self.h5_main.shape[0], dtype=np.uint8))
    # keeping legacy option:
    h5_svd_group.attrs['last_pixel'] = self.h5_main.shape[0]
def _create_guess_datasets(self):
    """
    Creates the h5 group, guess dataset, corresponding spectroscopic datasets
    and also links the guess dataset to the spectroscopic datasets.
    """
    self.h5_results_grp = create_results_group(
        self.h5_main, self.process_name,
        h5_parent_group=self._h5_target_group)
    write_simple_attrs(self.h5_results_grp, self.parms_dict)

    # If writing to a new HDF5 file:
    # Add back the data_type attribute - still being used in the visualizer
    if self.h5_results_grp.file != self.h5_main.file:
        write_simple_attrs(self.h5_results_grp.file,
                           {'data_type': get_attr(self.h5_main.file,
                                                  'data_type')})

    ret_vals = write_reduced_anc_dsets(self.h5_results_grp,
                                       self.h5_main.h5_spec_inds,
                                       self.h5_main.h5_spec_vals,
                                       self._fit_dim_name,
                                       verbose=self.verbose)

    h5_sho_inds, h5_sho_vals = ret_vals

    self._h5_guess = write_main_dataset(
        self.h5_results_grp,
        (self.h5_main.shape[0], self.num_udvs_steps),
        'Guess', 'SHO', 'compound', None, None,
        h5_pos_inds=self.h5_main.h5_pos_inds,
        h5_pos_vals=self.h5_main.h5_pos_vals,
        h5_spec_inds=h5_sho_inds,
        h5_spec_vals=h5_sho_vals,
        chunks=(1, self.num_udvs_steps),
        dtype=sho32,
        main_dset_attrs=self.parms_dict,
        verbose=self.verbose)

    # Does not make sense to propagate region refs - nobody uses them
    # copy_region_refs(self.h5_main, self._h5_guess)

    self._h5_guess.file.flush()

    if self.verbose and self.mpi_rank == 0:
        print('Finished creating Guess dataset')
def copy_main_attributes(h5_main, h5_new):
    """
    Copies the units and quantity name from one dataset to another

    Parameters
    ----------
    h5_main : h5py.Dataset
        Dataset containing the target attributes
    h5_new : h5py.Dataset
        Dataset to which the target attributes are to be copied
    """
    for param, param_name in zip([h5_main, h5_new], ['h5_main', 'h5_new']):
        if not isinstance(param, h5py.Dataset):
            raise TypeError(param_name + ' should be a h5py.Dataset object')

    for att_name in ['quantity', 'units']:
        if att_name not in h5_main.attrs:
            raise KeyError('Attribute: {} does not exist in {}'
                           ''.format(att_name, h5_main))
        val = get_attr(h5_main, att_name)
        h5_new.attrs[att_name] = clean_string_att(val)
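# Usage sketch (illustrative; dataset names are hypothetical): stamp a
# freshly written result with the 'quantity' and 'units' of its source Main
# dataset.
#
# >>> with h5py.File('data.h5', mode='r+') as h5_f:
# ...     h5_new = h5_f.create_dataset('Filtered_Data',
# ...                                  data=h5_f['Raw_Data'][()])
# ...     copy_main_attributes(h5_f['Raw_Data'], h5_new)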
def get_attr(h5_object, attr_name):
    """
    Returns the attribute from the h5py object

    Parameters
    ----------
    h5_object : :class:`h5py.Dataset`, :class:`h5py.Group` or
        :class:`h5py.File` object whose attribute is desired
    attr_name : str
        Name of the attribute of interest

    Returns
    -------
    att_val : object
        value of attribute, in certain cases (byte strings or list of byte
        strings) reformatted to readily usable forms
    """
    warn('pyUSID.io.hdf_utils.get_attr has been moved to '
         'sidpy.hdf.hdf_utils.get_attr. This copy in pyUSID will '
         'be removed in a future release. Please update your import '
         'statements')
    return hut.get_attr(h5_object, attr_name)
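# Usage sketch (illustrative; file and dataset names are hypothetical).
# Unlike raw h5py attribute access, byte strings come back as regular python
# strings:
#
# >>> with h5py.File('data.h5', mode='r') as h5_f:
# ...     units = get_attr(h5_f['Raw_Data'], 'units')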
def __init__(self, h5_ref):
    """
    Parameters
    ----------
    h5_ref : :class:`h5py.Dataset`
        The dataset which is actually a USID Main dataset
        This dataset has hdf5 dimensional scales

    Methods
    -------
    self.slice
    self.data_descriptor():
        returns the label of the dataset
    self.get_dimension_labels():
        returns the labels of the dimensions
    self.get_dimens_types():
        returns dictionary of dimension_types (keys) with the axis numbers
        as values
    self.visualize(slice):
        not tested
        basic visualization of dataset based on dimension_types and slice
        (optional)
        returns fig and axis

    Attributes
    ----------
    self.data_type: str
        The data_type (supported are: 'image', 'image_stack', 'spectrum',
        'linescan' and 'spectrum_image')
    self.quantity: str
        The physical quantity represented in the dataset
    self.units: str
        The units of the dataset
    self.axes_units: list of str
        The units for the dimensional axes.
    self.axes_quantities: list of str
        The quantities (physical property) for the dimensional axes.
    self.dimension_types: list of str
        The dimension_types (supported is 'spatial', 'spectral',
        'reciprocal' and 'time') for the dimensional axes.
    self.axes_first_pixels: list of int
        A list of the first-pixel sizes (spacing between the first two
        values) of each dimension.
    """
    super(NSIDataset, self).__init__(h5_ref.id)

    self.data_type = get_attr(self, 'data_type')
    self.quantity = self.attrs['quantity']
    self.units = self.attrs['units']

    # self.axes_names = [dim.label for dim in h5_ref.dims]
    units = []
    quantities = []
    dimension_types = []
    pixel_sizes = []

    for dim in h5_ref.dims:
        units.append(get_attr(dim[0], 'units'))
        quantities.append(get_attr(dim[0], 'quantity'))
        dimension_types.append(get_attr(dim[0], 'dimension_type'))
        pixel_sizes.append(abs(dim[0][1] - dim[0][0]))

    self.axes_units = units
    self.axes_quantities = quantities
    self.dimension_types = dimension_types
    self.axes_first_pixels = pixel_sizes

    self.data_descriptor = '{} ({})'.format(get_attr(self, 'quantity'),
                                            get_attr(self, 'units'))
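# Usage sketch (illustrative; the channel path below is hypothetical): wrap a
# dataset that carries HDF5 dimension scales to get its NSID metadata.
#
# >>> h5_f = h5py.File('image_stack.h5', mode='r')
# >>> nsid_dset = NSIDataset(h5_f['Measurement_000/Channel_000/nDim_Data'])
# >>> print(nsid_dset.data_type, nsid_dset.data_descriptor)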
def reshape_to_n_dims(h5_main, h5_pos=None, h5_spec=None, get_labels=False,
                      verbose=False, sort_dims=False, lazy=False):
    """
    Reshape the input 2D matrix to be N-dimensions based on the
    position and spectroscopic datasets.

    Parameters
    ----------
    h5_main : HDF5 Dataset
        2D data to be reshaped
    h5_pos : HDF5 Dataset, optional
        Position indices corresponding to rows in `h5_main`
    h5_spec : HDF5 Dataset, optional
        Spectroscopic indices corresponding to columns in `h5_main`
    get_labels : bool, optional
        Whether or not to return the dimension labels. Default = False
    verbose : bool, optional
        Whether or not to print debugging statements
    sort_dims : bool
        If True, the data is sorted so that the dimensions are in order from
        slowest to fastest
        If False, the data is kept in the original order
        If `get_labels` is also True, the labels are sorted as well.
    lazy : bool, optional. Default = False
        If False, ds_Nd will be a numpy.ndarray object - this is suitable if
        the HDF5 dataset fits into memory
        If True, ds_Nd will be a dask.array object - this is suitable if the
        HDF5 dataset is too large to fit into memory. Note that this will be
        a lazy computation, meaning that the returned object just contains
        the instructions. In order to get the actual value or content in
        numpy arrays, call ds_Nd.compute()

    Returns
    -------
    ds_Nd : N-D numpy array or dask.array object
        N dimensional array arranged as [positions slowest to fastest,
        spectroscopic slowest to fastest]
    success : boolean or string
        True if full reshape was successful
        "Positions" if it was only possible to reshape by the position
        dimensions
        False if no reshape was possible
    ds_labels : list of str
        List of the labels of each dimension of `ds_Nd`

    Notes
    -----
    If either `h5_pos` or `h5_spec` are not provided, the function will first
    attempt to find them as attributes of `h5_main`. If that fails, it will
    generate dummy values for them.
    """
    # TODO: automatically switch on lazy if the data is larger than memory
    # TODO: sort_dims does not appear to do much. Functions as though it was
    # always True
    if h5_pos is None and h5_spec is None:
        if not check_if_main(h5_main):
            raise ValueError('if h5_main is a h5py.Dataset it should be a '
                             'Main dataset')
    else:
        if not isinstance(h5_main, (h5py.Dataset, np.ndarray,
                                    da.core.Array)):
            raise TypeError('h5_main should either be a h5py.Dataset or '
                            'numpy array')

    if h5_pos is not None:
        if not isinstance(h5_pos, (h5py.Dataset, np.ndarray, da.core.Array)):
            raise TypeError('h5_pos should either be a h5py.Dataset or '
                            'numpy array')
        if h5_pos.shape[0] != h5_main.shape[0]:
            raise ValueError('The size of h5_pos: {} does not match with '
                             'h5_main: {}'.format(h5_pos.shape,
                                                  h5_main.shape))

    if h5_spec is not None:
        if not isinstance(h5_spec, (h5py.Dataset, np.ndarray,
                                    da.core.Array)):
            raise TypeError('h5_spec should either be a h5py.Dataset or '
                            'numpy array')
        if h5_spec.shape[1] != h5_main.shape[1]:
            raise ValueError('The size of h5_spec: {} does not match with '
                             'h5_main: {}'.format(h5_spec.shape,
                                                  h5_main.shape))

    pos_labs = np.array(['Positions'])
    spec_labs = np.array(['Spectral_Step'])
    if h5_pos is None:
        """
        Get the Position datasets from the references if possible
        """
        if isinstance(h5_main, h5py.Dataset):
            try:
                h5_pos = h5_main.file[h5_main.attrs['Position_Indices']]
                ds_pos = h5_pos[()]
                pos_labs = get_attr(h5_pos, 'labels')
            except KeyError:
                print('No position datasets found as attributes of {}'
                      ''.format(h5_main.name))
                if len(h5_main.shape) > 1:
                    ds_pos = np.arange(h5_main.shape[0],
                                       dtype=INDICES_DTYPE).reshape(-1, 1)
                    pos_labs = np.array(['Position Dimension {}'.format(ipos)
                                         for ipos in range(ds_pos.shape[1])])
                else:
                    ds_pos = np.array(0, dtype=INDICES_DTYPE).reshape(-1, 1)
        else:
            ds_pos = np.arange(h5_main.shape[0],
                               dtype=INDICES_DTYPE).reshape(-1, 1)
            pos_labs = np.array(['Position Dimension {}'.format(ipos)
                                 for ipos in range(ds_pos.shape[1])])
    elif isinstance(h5_pos, h5py.Dataset):
        """
        Position Indices dataset was provided
        """
        ds_pos = h5_pos[()]
        pos_labs = get_attr(h5_pos, 'labels')
    elif isinstance(h5_pos, (np.ndarray, da.core.Array)):
        ds_pos = np.atleast_2d(h5_pos)
        pos_labs = np.array(['Position Dimension {}'.format(ipos)
                             for ipos in range(ds_pos.shape[1])])
    else:
        raise TypeError('Position Indices must be either h5py.Dataset or '
                        'None')

    if h5_spec is None:
        """
        Get the Spectroscopic datasets from the references if possible
        """
        if isinstance(h5_main, h5py.Dataset):
            try:
                h5_spec = h5_main.file[
                    h5_main.attrs['Spectroscopic_Indices']]
                ds_spec = h5_spec[()]
                spec_labs = get_attr(h5_spec, 'labels')
            except KeyError:
                print('No spectroscopic datasets found as attributes of {}'
                      ''.format(h5_main.name))
                if len(h5_main.shape) > 1:
                    ds_spec = np.arange(h5_main.shape[1],
                                        dtype=INDICES_DTYPE).reshape([1, -1])
                    spec_labs = np.array(['Spectral Dimension {}'
                                          ''.format(ispec) for ispec in
                                          range(ds_spec.shape[0])])
                else:
                    ds_spec = np.array(0,
                                       dtype=INDICES_DTYPE).reshape([1, 1])
        else:
            ds_spec = np.arange(h5_main.shape[1],
                                dtype=INDICES_DTYPE).reshape([1, -1])
            spec_labs = np.array(['Spectral Dimension {}'.format(ispec)
                                  for ispec in range(ds_spec.shape[0])])
    elif isinstance(h5_spec, h5py.Dataset):
        """
        Spectroscopic Indices dataset was provided
        """
        ds_spec = h5_spec[()]
        spec_labs = get_attr(h5_spec, 'labels')
    elif isinstance(h5_spec, (np.ndarray, da.core.Array)):
        ds_spec = h5_spec
        spec_labs = np.array(['Spectral Dimension {}'.format(ispec)
                              for ispec in range(ds_spec.shape[0])])
    else:
        raise TypeError('Spectroscopic Indices must be either h5py.Dataset '
                        'or None')

    '''
    Sort the indices from fastest to slowest
    '''
    pos_sort = get_sort_order(np.transpose(ds_pos))
    spec_sort = get_sort_order(ds_spec)

    if verbose:
        print('Position dimensions:', pos_labs)
        print('Position sort order:', pos_sort)
        print('Spectroscopic Dimensions:', spec_labs)
        print('Spectroscopic sort order:', spec_sort)

    '''
    Get the size of each dimension in the sorted order
    '''
    pos_dims = get_dimensionality(np.transpose(ds_pos), pos_sort)
    spec_dims = get_dimensionality(ds_spec, spec_sort)

    if np.prod(pos_dims) != h5_main.shape[0]:
        mesg = 'Product of position dimension sizes: {} = {} not matching ' \
               'with size of first axis of main dataset: {}. One or more ' \
               'dimensions are dependent dimensions and not marked as such' \
               '.'.format(pos_dims, np.prod(pos_dims), h5_main.shape[0])
        raise ValueError(mesg)

    if np.prod(spec_dims) != h5_main.shape[1]:
        mesg = 'Product of spectroscopic dimension sizes: {} = {} not ' \
               'matching with size of second axis of main dataset: {}. ' \
               'One or more dimensions are dependent dimensions and not ' \
               'marked as such.'.format(spec_dims, np.prod(spec_dims),
                                        h5_main.shape[1])
        raise ValueError(mesg)

    if verbose:
        print('\nPosition dimensions (sort applied):', pos_labs[pos_sort])
        print('Position dimensionality (sort applied):', pos_dims)
        print('Spectroscopic dimensions (sort applied):',
              spec_labs[spec_sort])
        print('Spectroscopic dimensionality (sort applied):', spec_dims)

    if lazy:
        ds_main = lazy_load_array(h5_main)
    else:
        ds_main = h5_main[()]

    """
    Now we reshape the dataset based on those dimensions. numpy reshapes
    correctly when the dimensions are arranged from slowest to fastest.
    Since the sort orders we have are from fastest to slowest, we need to
    reverse the orders for both the position and spectroscopic dimensions
    """
    if verbose:
        print('Will attempt to reshape main dataset from:\n{} to {}'
              ''.format(ds_main.shape, pos_dims[::-1] + spec_dims[::-1]))

    try:
        ds_Nd = ds_main.reshape(pos_dims[::-1] + spec_dims[::-1])
    except ValueError:
        warn('Could not reshape dataset to full N-dimensional form. '
             'Attempting reshape based on position only.')
        try:
            ds_Nd = ds_main.reshape(pos_dims[::-1] + [-1])
        except ValueError:
            warn('Reshape by position only also failed. Will keep dataset '
                 'in 2d form.')
            if get_labels:
                return ds_main, False, ['Position', 'Spectral Step']
            else:
                return ds_main, False
        else:
            # No exception
            if get_labels:
                return ds_Nd, 'Positions', ['Position'] + list(spec_labs)
            else:
                return ds_Nd, 'Positions'

    all_labels = np.hstack((pos_labs[pos_sort][::-1],
                            spec_labs[spec_sort][::-1]))

    if verbose:
        print('\nAfter reshaping, labels are', all_labels)
        print('Data shape is', ds_Nd.shape)

    """
    At this point, the data is arranged from slowest to fastest dimension in
    both pos and spec
    """
    if sort_dims:
        results = [ds_Nd, True]
        if get_labels:
            results.append(all_labels)
        return results

    if verbose:
        print('\nGoing to put dimensions back in the same order as in the '
              'file:')

    swap_axes = list()
    # Compare the original order of the pos / spec labels with where these
    # dimensions occur in the sorted labels
    for lab in pos_labs:
        swap_axes.append(np.argwhere(all_labels == lab).squeeze())
    for lab in spec_labs:
        swap_axes.append(np.argwhere(all_labels == lab).squeeze())

    swap_axes = np.array(swap_axes)

    if verbose:
        print('Axes will be permuted in this order:', swap_axes)
        print('New labels ordering:', all_labels[swap_axes])

    ds_Nd = ds_Nd.transpose(tuple(swap_axes))
    results = [ds_Nd, True]

    if verbose:
        print('Dataset now of shape:', ds_Nd.shape)

    if get_labels:
        '''
        Get the labels in the proper order
        '''
        results.append(all_labels[swap_axes])

    return results
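# Usage sketch (illustrative; file and dataset paths are hypothetical):
# recover the N-dimensional form and dimension labels of a USID Main dataset.
#
# >>> with h5py.File('data.h5', mode='r') as h5_f:
# ...     ds_nd, success, labels = reshape_to_n_dims(
# ...         h5_f['/Raw_Measurement/source_main'], get_labels=True)
# ...     print(success, ds_nd.shape, list(labels))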
def rebuild_svd(h5_main, components=None, cores=None, max_RAM_mb=1024):
    """
    Rebuild the Image from the SVD results on the windows.
    Optionally, only use components less than n_comp.

    :param h5_main: dataset which SVD was performed on
    :type h5_main: hdf5 Dataset
    :param components: Defines which components to keep
        Default - None, all components kept

        Input Types
        integer : Components less than the input will be kept
        length 2 iterable of integers : Integers define start and stop of
            component slice to retain
        other iterable of integers or slice : Selection of component indices
            to retain
    :type components: {int, iterable of int, slice} optional
    :param cores: How many cores should be used to rebuild
        Default - None, all but 2 cores will be used, min 1
    :type cores: int, optional
    :param max_RAM_mb: Maximum amount of memory to use when rebuilding,
        in Mb.
        Default - 1024Mb
    :type max_RAM_mb: int, optional

    :raise: KeyError if SVD results not found

    :returns: rebuilt dataset
    :rtype: HDF5 Dataset
    """
    if not isinstance(h5_main, USIDataset):
        h5_main = USIDataset(h5_main)

    comp_slice, num_comps = get_component_slice(
        components, total_components=h5_main.shape[1])
    if isinstance(comp_slice, np.ndarray):
        comp_slice = list(comp_slice)
    dset_name = h5_main.name.split('/')[-1]

    # Ensuring that at least one core is available for use / 2 cores are
    # available for other use
    max_cores = max(1, cpu_count() - 2)
    # print('max_cores', max_cores)
    if cores is not None:
        cores = min(round(abs(cores)), max_cores)
    else:
        cores = max_cores

    max_memory = min(max_RAM_mb * 1024 ** 2, 0.75 * get_available_memory())
    if cores != 1:
        max_memory = int(max_memory / 2)

    '''
    Get the handles for the SVD results
    '''
    try:
        h5_svd_group = find_results_groups(h5_main, 'SVD')[-1]

        h5_S = h5_svd_group['S']
        h5_U = h5_svd_group['U']
        h5_V = h5_svd_group['V']

    except KeyError:
        raise KeyError('SVD Results for {dset} were not found.'
                       ''.format(dset=dset_name))
    except:
        raise

    func, is_complex, is_compound, n_features, type_mult = check_dtype(h5_V)

    '''
    Calculate the size of a single batch that will fit in the available
    memory
    '''
    n_comps = h5_S[comp_slice].size
    mem_per_pix = (h5_U.dtype.itemsize +
                   h5_V.dtype.itemsize * h5_V.shape[1]) * n_comps
    fixed_mem = h5_main.size * h5_main.dtype.itemsize

    if cores is None:
        free_mem = max_memory - fixed_mem
    else:
        free_mem = max_memory * 2 - fixed_mem

    batch_size = int(round(float(free_mem) / mem_per_pix))
    if batch_size < 0:
        print('Batches listed were negative', batch_size)
        batch_size = 100
    batch_slices = gen_batches(h5_U.shape[0], batch_size)

    print('Reconstructing in batches of {} positions.'.format(batch_size))
    print('Batches should be {} Mb each.'.format(mem_per_pix * batch_size /
                                                 1024.0 ** 2))

    '''
    Loop over all batches.
    '''
    ds_V = np.dot(np.diag(h5_S[comp_slice]), func(h5_V[comp_slice, :]))
    rebuild = np.zeros((h5_main.shape[0], ds_V.shape[1]))
    for ibatch, batch in enumerate(batch_slices):
        rebuild[batch, :] += np.dot(h5_U[batch, comp_slice], ds_V)

    rebuild = stack_real_to_target_dtype(rebuild, h5_V.dtype)

    print('Completed reconstruction of data from SVD results. '
          'Writing to file.')

    '''
    Create the Group and dataset to hold the rebuilt data
    '''
    rebuilt_grp = create_indexed_group(h5_svd_group, 'Rebuilt_Data')
    h5_rebuilt = write_main_dataset(rebuilt_grp, rebuild, 'Rebuilt_Data',
                                    get_attr(h5_main, 'quantity'),
                                    get_attr(h5_main, 'units'),
                                    None, None,
                                    h5_pos_inds=h5_main.h5_pos_inds,
                                    h5_pos_vals=h5_main.h5_pos_vals,
                                    h5_spec_inds=h5_main.h5_spec_inds,
                                    h5_spec_vals=h5_main.h5_spec_vals,
                                    chunks=h5_main.chunks,
                                    compression=h5_main.compression)

    if isinstance(comp_slice, slice):
        rebuilt_grp.attrs['components_used'] = '{}-{}'.format(
            comp_slice.start, comp_slice.stop)
    else:
        rebuilt_grp.attrs['components_used'] = components

    copy_attributes(h5_main, h5_rebuilt, skip_refs=False)

    h5_main.file.flush()

    print('Done writing reconstructed data to file.')

    return h5_rebuilt
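# Usage sketch (illustrative): assuming SVD has already been computed on
# h5_main so that an 'SVD' results group exists in the same file, a low-rank
# reconstruction from the first 16 components could be written back as:
#
# >>> h5_rebuilt = rebuild_svd(h5_main, components=16)
# >>> print(h5_rebuilt.shape == h5_main.shape)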
def _write_results_chunk(self):
    """
    Writes the labels and mean response to the h5 file

    Returns
    ---------
    h5_group : HDF5 Group reference
        Reference to the group that contains the decomposition results
    """
    self.h5_results_grp = create_results_group(
        self.h5_main, self.process_name,
        h5_parent_group=self._h5_target_group)
    self._write_source_dset_provenance()
    write_simple_attrs(self.h5_results_grp, self.parms_dict)
    write_simple_attrs(self.h5_results_grp,
                       {'n_components': self.__components.shape[0],
                        'n_samples': self.h5_main.shape[0]})

    decomp_desc = Dimension('Endmember', 'a. u.',
                            self.__components.shape[0])

    # equivalent to V - compound / complex
    h5_components = write_main_dataset(
        self.h5_results_grp, self.__components, 'Components',
        get_attr(self.h5_main, 'quantity')[0], 'a.u.', decomp_desc, None,
        h5_spec_inds=self.h5_main.h5_spec_inds,
        h5_spec_vals=self.h5_main.h5_spec_vals)

    # equivalent of U - real
    h5_projections = write_main_dataset(
        self.h5_results_grp, np.float32(self.__projection), 'Projection',
        'abundance', 'a.u.', None, decomp_desc, dtype=np.float32,
        h5_pos_inds=self.h5_main.h5_pos_inds,
        h5_pos_vals=self.h5_main.h5_pos_vals)

    # Marking completion:
    self._status_dset_name = 'completed_positions'
    self._h5_status_dset = self.h5_results_grp.create_dataset(
        self._status_dset_name,
        data=np.ones(self.h5_main.shape[0], dtype=np.uint8))
    # keeping legacy option:
    self.h5_results_grp.attrs['last_pixel'] = self.h5_main.shape[0]

    # return the h5 group object
    return self.h5_results_grp
def get_unit_values(ds_inds, ds_vals, dim_names=None, all_dim_names=None,
                    is_spec=None, verbose=False):
    """
    Gets the unit arrays of values that describe the spectroscopic dimensions

    Parameters
    ----------
    ds_inds : h5py.Dataset or numpy.ndarray
        Spectroscopic or Position Indices dataset
    ds_vals : h5py.Dataset or numpy.ndarray
        Spectroscopic or Position Values dataset
    dim_names : str, or list of str, Optional
        Names of the dimensions of interest. Default = all
    all_dim_names : list of str, Optional
        Names of all the dimensions in these datasets. Use this if supplying
        numpy arrays instead of h5py.Dataset objects for h5_inds, h5_vals
        since there is no other way of getting the dimension names.
    is_spec : bool, optional
        Whether or not the provided ancillary datasets are position or
        spectroscopic
        The user is recommended to supply this parameter whenever it is known
        By default, this function will attempt to recognize the answer based
        on the shape of the datasets.
    verbose : bool, optional
        Whether or not to print debugging statements. Default - off

    Note - this function can be extended / modified for ancillary position
    dimensions as well

    Returns
    -------
    unit_values : dict
        Dictionary containing the unit array for each dimension. The name of
        the dimensions are the keys.
    """
    if all_dim_names is None:
        allowed_types = h5py.Dataset
    else:
        all_dim_names = validate_list_of_strings(all_dim_names,
                                                 'all_dim_names')
        all_dim_names = np.array(all_dim_names)
        allowed_types = (h5py.Dataset, np.ndarray)

    for dset, dset_name in zip([ds_inds, ds_vals], ['ds_inds', 'ds_vals']):
        if not isinstance(dset, allowed_types):
            raise TypeError(dset_name + ' should be of type: {}'
                                        ''.format(allowed_types))

    # For now, we will throw an error if even a single dimension is listed
    # as an incomplete dimension:
    if isinstance(ds_inds, h5py.Dataset):
        if np.any(['incomplete_dimensions' in dset.attrs.keys()
                   for dset in [ds_inds, ds_vals]]):
            try:
                incomp_dims_inds = get_attr(ds_inds, 'incomplete_dimensions')
            except KeyError:
                incomp_dims_inds = None
            try:
                incomp_dims_vals = get_attr(ds_vals, 'incomplete_dimensions')
            except KeyError:
                incomp_dims_vals = None
            if incomp_dims_inds is None and incomp_dims_vals is not None:
                incomp_dims = incomp_dims_vals
            elif incomp_dims_inds is not None and incomp_dims_vals is None:
                incomp_dims = incomp_dims_inds
            else:
                # ensure that both attributes are the same
                if incomp_dims_vals != incomp_dims_inds:
                    raise ValueError('Provided indices ({}) and values ({}) '
                                     'datasets were marked with different '
                                     'values for incomplete_datasets.'
                                     ''.format(incomp_dims_inds,
                                               incomp_dims_vals))
                incomp_dims = incomp_dims_vals

            all_dim_names = get_attr(ds_inds, 'labels')
            raise ValueError('Among all dimensions: {}, these dimensions '
                             'were marked as incomplete dimensions: {}. You '
                             'are recommended to find unit values manually'
                             ''.format(all_dim_names, incomp_dims))

    # Do we need to check that the provided inds and vals correspond to the
    # same main dataset?
    if ds_inds.shape != ds_vals.shape:
        raise ValueError('h5_inds: {} and h5_vals: {} should have the same '
                         'shapes'.format(ds_inds.shape, ds_vals.shape))

    if all_dim_names is None:
        all_dim_names = get_attr(ds_inds, 'labels')
    if verbose:
        print('All dimensions: {}'.format(all_dim_names))

    # First load to memory
    inds_mat = ds_inds[()]
    vals_mat = ds_vals[()]

    if is_spec is None:
        # Attempt to recognize the type automatically
        is_spec = False
        if inds_mat.shape[0] < inds_mat.shape[1]:
            is_spec = True
    else:
        if not isinstance(is_spec, bool):
            raise TypeError('is_spec should be a boolean. Provided object '
                            'is of type: {}'.format(type(is_spec)))

    if verbose:
        print('Ancillary matrices of shape: {}, hence determined to be '
              'Spectroscopic: {}'.format(inds_mat.shape, is_spec))

    if not is_spec:
        # Convert to spectral shape
        inds_mat = np.transpose(inds_mat)
        vals_mat = np.transpose(vals_mat)

    if len(all_dim_names) != inds_mat.shape[0]:
        raise ValueError('Length of dimension names list: {} not matching '
                         'with shape of dataset: {}.'
                         ''.format(len(all_dim_names), inds_mat.shape[0]))

    if dim_names is None:
        dim_names = all_dim_names
        if verbose:
            print('Going to return unit values for all dimensions: {}'
                  ''.format(all_dim_names))
    else:
        dim_names = validate_list_of_strings(dim_names, 'dim_names')

        if verbose:
            print('Checking to make sure that the target dimension names: '
                  '{} exist in the datasets attributes: {}.'
                  ''.format(dim_names, all_dim_names))

        # check to make sure that the dimension names exist in the datasets:
        for dim_name in dim_names:
            if dim_name not in all_dim_names:
                raise KeyError('Dimension {} does not exist in the provided '
                               'ancillary datasets'.format(dim_name))

    unit_values = dict()
    for dim_name in all_dim_names:
        # Find the row in the spectroscopic indices that corresponds to the
        # dimensions we want to slice:
        if verbose:
            print('Looking for dimension: {} in {}'.format(dim_name,
                                                           dim_names))
        desired_row_ind = np.where(all_dim_names == dim_name)[0][0]

        inds_for_dim = inds_mat[desired_row_ind]

        # Wherever this dimension goes to 0 - start of a new tile
        starts = np.where(inds_for_dim == np.min(inds_for_dim))[0]

        if starts[0] != 0:
            raise ValueError('Spectroscopic Indices for dimension: "{}" not '
                             'starting with 0. Please fix this and try '
                             'again.'.format(dim_name))

        # There may be repetitions in addition to tiling. Find how the
        # positions increase.
        # 1 = repetition, > 1 = new tile
        step_sizes = np.hstack(([1], np.diff(starts)))
        # This array is of the same length as the full indices array

        # We should expect only two values of step sizes for a regular
        # dimension (tiles of the same size):
        # 1 for same value repeating and a big jump in indices when the next
        # tile starts
        # If the repeats / tiles are of different lengths, then this is not
        # a regular dimension.
        # What does a Unit Values vector even mean in this case? Just raise
        # an error for now
        if np.where(np.unique(step_sizes) - 1)[0].size > 1:
            raise ValueError('Non constant step sizes')

        # Finding the start of a new tile
        tile_starts = np.where(step_sizes > 1)[0]

        # converting these indices to correct indices that can be mapped
        # straight to
        if len(tile_starts) < 1:
            # Dimension(s) with no tiling at all
            # Make it look as though the next tile starts at the end of the
            # whole indices vector
            tile_starts = np.array([0, len(inds_for_dim)])
        else:
            # Dimension with some form of repetition
            tile_starts = np.hstack(([0], starts[tile_starts]))

            # Verify that each tile is identical here
            # Last tile will not be checked unless we add the length of the
            # indices vector as the start of next tile
            tile_starts = np.hstack((tile_starts, [len(inds_for_dim)]))
            subsections = [inds_for_dim[tile_starts[ind]:
                                        tile_starts[ind + 1]]
                           for ind in range(len(tile_starts) - 1)]
            if np.max(np.diff(subsections, axis=0)) != 0:
                # Should get unit values for ALL dimensions regardless of
                # expectations to catch such scenarios.
                raise ValueError('Values in each tile of dimension: {} are '
                                 'different'.format(dim_name))

        # Now looking within the first tile:
        subsection = inds_for_dim[tile_starts[0]: tile_starts[1]]
        # remove all repetitions. ie - take indices only where jump == 1
        step_inds = np.hstack(
            ([0], np.where(np.hstack(([0], np.diff(subsection))))[0]))

        # Finally, use these indices to get the values
        if dim_name in dim_names:
            # Only add this dimension to dictionary if requested.
            unit_values[dim_name] = vals_mat[desired_row_ind, step_inds]

    return unit_values
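# Usage sketch (illustrative; the dimension name 'Bias' is hypothetical):
# recover the unit vector of values for one spectroscopic dimension of a
# Main dataset.
#
# >>> unit_vals = get_unit_values(h5_main.h5_spec_inds,
# ...                             h5_main.h5_spec_vals, dim_names='Bias')
# >>> bias_vec = unit_vals['Bias']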
def validate_aux_dset_pair(test_class, h5_group, h5_inds, h5_vals, dim_names,
                           dim_units, inds_matrix, vals_matrix=None,
                           base_name=None, h5_main=None, is_spectral=True,
                           slow_to_fast=False, check_reg_refs=False):
    if vals_matrix is None:
        vals_matrix = inds_matrix
    if base_name is None:
        if is_spectral:
            base_name = 'Spectroscopic'
        else:
            base_name = 'Position'
    else:
        test_class.assertIsInstance(base_name, (str, unicode))

    if not slow_to_fast:
        # Sending in Fast to Slow but what comes out is slow to fast
        func = np.flipud if is_spectral else np.fliplr

        vals_matrix = func(vals_matrix)
        inds_matrix = func(inds_matrix)

        dim_names = dim_names[::-1]
        dim_units = dim_units[::-1]

    for h5_dset, exp_dtype, exp_name, ref_data in zip(
            [h5_inds, h5_vals],
            [INDICES_DTYPE, VALUES_DTYPE],
            [base_name + '_Indices', base_name + '_Values'],
            [inds_matrix, vals_matrix]):
        if isinstance(h5_main, h5py.Dataset):
            test_class.assertEqual(h5_main.file[h5_main.attrs[exp_name]],
                                   h5_dset)
        test_class.assertIsInstance(h5_dset, h5py.Dataset)
        test_class.assertEqual(h5_dset.parent, h5_group)
        test_class.assertEqual(h5_dset.name.split('/')[-1], exp_name)
        test_class.assertTrue(np.allclose(ref_data, h5_dset[()]))
        test_class.assertEqual(h5_dset.dtype, exp_dtype)
        test_class.assertTrue(np.all([_ in h5_dset.attrs.keys()
                                      for _ in ['labels', 'units']]))
        test_class.assertTrue(np.all([x == y for x, y in
                                      zip(dim_names,
                                          get_attr(h5_dset, 'labels'))]))
        test_class.assertTrue(np.all([x == y for x, y in
                                      zip(dim_units,
                                          get_attr(h5_dset, 'units'))]))

        # assert region references even though these are not used anywhere:
        if check_reg_refs:
            for dim_ind, curr_name in enumerate(dim_names):
                if is_spectral:
                    expected = np.squeeze(ref_data[dim_ind])
                else:
                    expected = np.squeeze(ref_data[:, dim_ind])
                actual = np.squeeze(h5_dset[h5_dset.attrs[curr_name]])
                try:
                    match = np.allclose(expected, actual)
                except ValueError:
                    match = False
                if match:
                    test_class.assertTrue(match)
                else:
                    warn('Test for region reference: ' + curr_name +
                         ' failed')
def write_reduced_anc_dsets(h5_parent_group, h5_inds, h5_vals, dim_name,
                            basename=None, is_spec=None, verbose=False):
    """
    Creates new Ancillary Indices and Values datasets from the input
    datasets by dropping the specified dimensions

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or h5py.File
        Group under which the indices and values datasets will be created
    h5_inds : HDF5 Dataset
        Spectroscopic or Positions indices dataset
    h5_vals : HDF5 Dataset
        Spectroscopic or Positions values dataset
    dim_name : str or unicode or list of strings
        Names of the dimension(s) to remove
    basename : str or unicode, Optional
        String to which '_Indices' and '_Values' will be appended to get the
        names of the new datasets.
        Default = 'Position' or 'Spectroscopic'
    is_spec : bool, optional
        Whether or not the provided ancillary datasets are position or
        spectroscopic
        The user is recommended to supply this parameter whenever it is
        known or possible.
        By default, this function will attempt to recognize the answer based
        on the shape of the datasets.
    verbose : bool, optional. Default = False
        Whether or not to print debugging print statements

    Returns
    -------
    h5_inds_new : h5py.Dataset
        Reduced indices dataset
    h5_vals_new : h5py.Dataset
        Reduced values dataset
    """
    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_parent_group should either be a h5py.Group or '
                        'File object')

    for param, param_name in zip([h5_inds, h5_vals], ['h5_inds', 'h5_vals']):
        if not isinstance(param, h5py.Dataset):
            raise TypeError(param_name + ' should be a h5py.Dataset object')
    if dim_name is not None:
        dim_name = validate_list_of_strings(dim_name, 'dim_name')

    all_dim_names = list(get_attr(h5_inds, 'labels'))
    for item in dim_name:
        if item not in all_dim_names:
            raise KeyError('Requested dimension: {} not in the list of '
                           'labels: {}'.format(item, all_dim_names))

    ind_mat = h5_inds[()]
    val_mat = h5_vals[()]

    if is_spec is None:
        # Attempt to recognize the type automatically
        is_spec = False
        if ind_mat.shape[0] == ind_mat.shape[1]:
            raise ValueError('Unable to automatically guess whether the '
                             'provided datasets are position or '
                             'spectroscopic. Please explicitly specify via '
                             'the "is_spec" boolean kwarg')
        if ind_mat.shape[0] < ind_mat.shape[1]:
            is_spec = True
    else:
        if not isinstance(is_spec, bool):
            raise TypeError('is_spec should be a boolean. Provided object '
                            'is of type: {}'.format(type(is_spec)))

    if basename is not None:
        basename = validate_single_string_arg(basename, 'basename')
        if basename.endswith('_'):
            basename = basename[:-1]
    else:
        if is_spec:
            basename = 'Spectroscopic'
        else:
            basename = 'Position'

    for sub_name in ['_Indices', '_Values']:
        if basename + sub_name in h5_parent_group.keys():
            raise KeyError('Dataset: {} already exists in provided group: '
                           '{}'.format(basename + sub_name,
                                       h5_parent_group.name))

    if set(dim_name) != set(all_dim_names):
        # At least one dimension will remain
        if verbose:
            print('All Dimensions: {}. Dimensions to be removed: {}'
                  ''.format(all_dim_names, dim_name))

        if not is_spec:
            # Convert to spectral shape
            ind_mat = np.transpose(ind_mat)
            val_mat = np.transpose(val_mat)

        # For all dimensions, find where the index = 0
        # basically, we are indexing all dimensions to 0
        first_indices = []
        keep_dim = np.ones(len(all_dim_names), dtype=bool)
        for cur_dim in dim_name:
            dim_ind = all_dim_names.index(cur_dim)
            keep_dim[dim_ind] = False
            # check equality against the minimum value instead of 0 to
            # account for cases when a dimension does not start from 0
            # (already been sliced) - think of multi-dimensional slicing!
            first_indices.append(ind_mat[dim_ind] ==
                                 np.min(ind_mat[dim_ind]))
        first_indices = np.vstack(first_indices)

        if verbose:
            print('Raw first_indices:')
            print(first_indices)
            print('Dimensions to keep: {}'.format(keep_dim))

        step_starts = np.all(first_indices, axis=0)

        if verbose:
            print('Columns in dataset to keep:')
            print(step_starts)

        '''
        Extract all rows that we want to keep from input indices and values
        '''
        # TODO: handle TypeError: Indexing elements must be in increasing
        # order
        ind_mat = ind_mat[keep_dim, :][:, step_starts]
        val_mat = val_mat[keep_dim, :][:, step_starts]

        if not is_spec:
            # Convert back to position shape
            ind_mat = np.transpose(ind_mat)
            val_mat = np.transpose(val_mat)

        '''
        Create new Datasets to hold the data
        Name them based on basename
        '''
        h5_inds_new = h5_parent_group.create_dataset(basename + '_Indices',
                                                     data=ind_mat,
                                                     dtype=h5_inds.dtype)
        h5_vals_new = h5_parent_group.create_dataset(basename + '_Values',
                                                     data=val_mat,
                                                     dtype=h5_vals.dtype)
        # Extracting the labels from the original spectroscopic data sets
        labels = h5_inds.attrs['labels'][keep_dim]
        # Adding the labels and units to the new spectroscopic data sets
        for dset in [h5_inds_new, h5_vals_new]:
            write_simple_attrs(dset,
                               {'labels': labels,
                                'units': h5_inds.attrs['units'][keep_dim]})

    else:
        # Remove all dimensions:
        h5_inds_new = h5_parent_group.create_dataset(basename + '_Indices',
                                                     data=np.array([[0]]),
                                                     dtype=INDICES_DTYPE)
        h5_vals_new = h5_parent_group.create_dataset(basename + '_Values',
                                                     data=np.array([[0]]),
                                                     dtype=VALUES_DTYPE)

        for dset in [h5_inds_new, h5_vals_new]:
            write_simple_attrs(dset, {'labels': ['Single_Step'],
                                      'units': ['a. u.']})

    return h5_inds_new, h5_vals_new
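# Usage sketch (illustrative; names are hypothetical): drop the 'Frequency'
# dimension from the spectroscopic ancillary pair of h5_main when writing
# the results of a fit over that dimension.
#
# >>> h5_fit_inds, h5_fit_vals = write_reduced_anc_dsets(
# ...     h5_results_grp, h5_main.h5_spec_inds, h5_main.h5_spec_vals,
# ...     'Frequency')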
def check_if_main(h5_main, verbose=False):
    """
    Checks the input dataset to see if it has all the necessary
    features to be considered a Main dataset.  This means it is
    2D and has the following attributes:

    * Position_Indices
    * Position_Values
    * Spectroscopic_Indices
    * Spectroscopic_Values
    * quantity
    * units

    In addition, the shapes of the ancillary matrices should match with that
    of h5_main

    Parameters
    ----------
    h5_main : HDF5 Dataset
        Dataset of interest
    verbose : Boolean (Optional. Default = False)
        Whether or not to print statements

    Returns
    -------
    success : Boolean
        True if all tests pass
    """
    try:
        validate_main_dset(h5_main, True)
    except Exception as exep:
        if verbose:
            print(exep)
        return False

    h5_name = h5_main.name.split('/')[-1]

    success = True

    # Check for Datasets
    dset_names = ['Position_Indices', 'Position_Values',
                  'Spectroscopic_Indices', 'Spectroscopic_Values']
    for name in dset_names:
        try:
            h5_anc_dset = h5_main.file[h5_main.attrs[name]]
            success = np.all([success, isinstance(h5_anc_dset,
                                                  h5py.Dataset)])
        except:
            if verbose:
                print('{} not found as an attribute of {}.'.format(name,
                                                                   h5_name))
            return False

    attr_success = np.all([att in h5_main.attrs
                           for att in ['quantity', 'units']])
    if not attr_success:
        if verbose:
            print('{} does not have the mandatory "quantity" and "units" '
                  'attributes'.format(h5_main.name))
        return False

    for attr_name in ['quantity', 'units']:
        val = get_attr(h5_main, attr_name)
        if not isinstance(val, (str, unicode)):
            if verbose:
                print('Attribute {} of {} found to be {}. Expected a string'
                      ''.format(attr_name, h5_main.name, val))
            return False

    # Blindly linking four datasets is still not sufficient. The sizes need
    # to match:
    anc_shape_match = list()
    h5_pos_inds = h5_main.file[h5_main.attrs['Position_Indices']]
    h5_pos_vals = h5_main.file[h5_main.attrs['Position_Values']]
    anc_shape_match.append(np.all(h5_pos_vals.shape == h5_pos_inds.shape))
    for anc_dset in [h5_pos_vals, h5_pos_inds]:
        anc_shape_match.append(np.all(h5_main.shape[0] == anc_dset.shape[0]))
    if not np.all(anc_shape_match):
        if verbose:
            print('The shapes of the Position indices:{}, values:{} '
                  'datasets did not match with that of the main dataset: {}'
                  ''.format(h5_pos_inds.shape, h5_pos_vals.shape,
                            h5_main.shape))
        return False

    anc_shape_match = list()
    h5_spec_inds = h5_main.file[h5_main.attrs['Spectroscopic_Indices']]
    h5_spec_vals = h5_main.file[h5_main.attrs['Spectroscopic_Values']]
    anc_shape_match.append(np.all(h5_spec_inds.shape == h5_spec_vals.shape))
    for anc_dset in [h5_spec_inds, h5_spec_vals]:
        anc_shape_match.append(np.all(h5_main.shape[1] == anc_dset.shape[1]))
    if not np.all(anc_shape_match):
        if verbose:
            print('The shapes of the Spectroscopic indices:{}, values:{} '
                  'datasets did not match with that of the main dataset: {}'
                  ''.format(h5_spec_inds.shape, h5_spec_vals.shape,
                            h5_main.shape))
        return False

    try:
        validate_anc_dset_attrs(h5_pos_inds, h5_pos_vals, is_spec=False)
    except ValueError:
        if verbose:
            print('Attributes of Position datasets did not match')
        return False

    try:
        validate_anc_dset_attrs(h5_spec_inds, h5_spec_vals, is_spec=True)
    except ValueError:
        if verbose:
            print('Attributes of Spectroscopic datasets did not match')
        return False

    return success
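# Usage sketch (illustrative; file and dataset paths are hypothetical):
#
# >>> with h5py.File('data.h5', mode='r') as h5_f:
# ...     print(check_if_main(h5_f['/Raw_Measurement/source_main'],
# ...                         verbose=True))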
def translate(self, h5_path, force_patch=False, **kwargs):
    """
    Add the needed references and attributes to the h5 file that are not
    created by the LabView data acquisition program.

    Parameters
    ----------
    h5_path : str
        path to the h5 file
    force_patch : bool, optional
        Should the check to see if the file has already been patched be
        ignored.
        Default False.

    Returns
    -------
    h5_file : str
        path to the patched dataset
    """
    # TODO: Need a way to choose which channels to apply the patcher to,
    # fails for multi-channel files where not all files are capable of being
    # main datasets

    # Open the file and check if a patch is needed
    h5_file = h5py.File(os.path.abspath(h5_path), 'r+')
    if h5_file.attrs.get('translator') is not None and not force_patch:
        print('File is already Pycroscopy ready.')
        h5_file.close()
        return h5_path

    '''
    Get the list of all Raw_Data Datasets
    Loop over the list and update the needed attributes
    '''
    raw_list = find_dataset(h5_file, 'Raw_Data')
    for h5_raw in raw_list:
        if 'quantity' not in h5_raw.attrs:
            h5_raw.attrs['quantity'] = 'quantity'
        if 'units' not in h5_raw.attrs:
            h5_raw.attrs['units'] = 'a.u.'

        # Grab the channel and measurement group of the data to check some
        # needed attributes
        h5_chan = h5_raw.parent
        try:
            c_type = get_attr(h5_chan, 'channel_type')
        except KeyError:
            warn_str = "'channel_type' was not found as an attribute of " \
                       "{}.\n".format(h5_chan.name)
            warn_str += "If this is BEPS or BELine data from the LabView " \
                        "acquisition software, please run the following " \
                        "piece of code.  Afterwards, run this function " \
                        "again.\n" + \
                        "CODE: " \
                        "hdf.file['{}'].attrs['channel_type'] = 'BE'" \
                        "".format(h5_chan.name)
            warn(warn_str)
            h5_file.close()
            return h5_path
        except:
            raise

        if c_type != 'BE':
            continue

        h5_meas = h5_chan.parent
        h5_meas.attrs['num_UDVS_steps'] = h5_meas.attrs['num_steps']

        # Get the object handles for the Indices and Values datasets
        h5_pos_inds = h5_chan['Position_Indices']
        h5_pos_vals = h5_chan['Position_Values']
        h5_spec_inds = h5_chan['Spectroscopic_Indices']
        h5_spec_vals = h5_chan['Spectroscopic_Values']

        # Make sure we have correct spectroscopic indices for the given
        # values
        ds_spec_inds = create_spec_inds_from_vals(h5_spec_vals[()])
        if not np.allclose(ds_spec_inds, h5_spec_inds[()]):
            h5_spec_inds[:, :] = ds_spec_inds[:, :]
            h5_file.flush()

        # Get the labels and units for the Spectroscopic datasets
        h5_spec_labels = h5_spec_inds.attrs['labels']
        inds_and_vals = [h5_pos_inds, h5_pos_vals, h5_spec_inds,
                         h5_spec_vals]
        for dset in inds_and_vals:
            spec_labels = dset.attrs['labels']
            try:
                spec_units = dset.attrs['units']
                if len(spec_units) != len(spec_labels):
                    raise KeyError
            except KeyError:
                dset.attrs['units'] = ['' for _ in spec_labels]
            except:
                raise

        """
        In early versions, too many spectroscopic dimension labels and units
        were listed compared to the number of rows. Remove here:
        """
        remove_non_exist_spec_dim_labs(h5_spec_inds, h5_spec_vals, h5_meas,
                                       verbose=False)

        """
        Add back some standard metadata to be consistent with older BE data
        """
        missing_metadata = dict()
        if 'File_file_name' not in h5_meas.attrs.keys():
            missing_metadata['File_file_name'] = \
                os.path.split(h5_raw.file.filename)[-1].replace('.h5', '')
        if 'File_date_and_time' not in h5_meas.attrs.keys():
            try:
                date_str = get_attr(h5_raw.file, 'date_string')
                time_str = get_attr(h5_raw.file, 'time_string')
                full_str = date_str.strip() + ' ' + time_str.strip()
                """
                convert:
                    date_string : 2018-12-05
                    time_string : 3:41:45 PM
                to: File_date_and_time: 19-Jun-2009 18:44:56
                """
                try:
                    dt_obj = datetime.datetime.strptime(
                        full_str, "%Y-%m-%d %I:%M:%S %p")
                    missing_metadata['File_date_and_time'] = \
                        dt_obj.strftime('%d-%b-%Y %H:%M:%S')
                except ValueError:
                    pass
            except KeyError:
                pass
        # Now write to measurement group:
        if len(missing_metadata) > 0:
            write_simple_attrs(h5_meas, missing_metadata)

        # Link the references to the Indices and Values datasets to the
        # Raw_Data
        print(h5_raw.shape, h5_pos_vals.shape, h5_spec_vals.shape)
        print(h5_spec_inds.shape, h5_pos_inds.shape)
        link_as_main(h5_raw, h5_pos_inds, h5_pos_vals, h5_spec_inds,
                     h5_spec_vals)

        # Also link the Bin_Frequencies and Bin_Wfm_Type datasets
        h5_freqs = h5_chan['Bin_Frequencies']
        aux_dset_names = ['Bin_Frequencies']
        aux_dset_refs = [h5_freqs.ref]
        check_and_link_ancillary(h5_raw, aux_dset_names,
                                 anc_refs=aux_dset_refs)

        '''
        Get all SHO_Fit groups for the Raw_Data and loop over them
        Get the Guess and Spectroscopic Datasets for each SHO_Fit group
        '''
        sho_list = find_results_groups(h5_raw, 'SHO_Fit')
        for h5_sho in sho_list:
            h5_sho_guess = h5_sho['Guess']
            h5_sho_spec_inds = h5_sho['Spectroscopic_Indices']
            h5_sho_spec_vals = h5_sho['Spectroscopic_Values']

            # Make sure we have correct spectroscopic indices for the given
            # values
            ds_sho_spec_inds = create_spec_inds_from_vals(
                h5_sho_spec_vals[()])
            if not np.allclose(ds_sho_spec_inds, h5_sho_spec_inds[()]):
                h5_sho_spec_inds[:, :] = ds_sho_spec_inds[:, :]

            # Get the labels and units for the Spectroscopic datasets
            h5_sho_spec_labels = get_attr(h5_sho_spec_inds, 'labels')
            h5_sho_spec_units = get_attr(h5_sho_spec_vals, 'units')

            if h5_sho_spec_inds.shape[-1] != h5_sho_guess.shape[-1]:
                print('Warning! Found incorrect spectral dimension for '
                      'dataset {}. Attempting a fix.'.format(h5_sho_guess))
                try:
                    h5_sho_spec_inds = h5_sho_guess.parent.create_dataset(
                        "h5_sho_spec_inds_fixed", shape=(1, 1),
                        dtype='uint32')
                    h5_sho_spec_inds.attrs['labels'] = 'labels'
                    h5_sho_spec_inds.attrs['units'] = 'units'
                except RuntimeError:
                    print("It seems that the file has already been patched. "
                          "Will use previously computed ancillary datasets")
                    h5_sho_spec_inds = h5_sho_guess.parent[
                        'h5_sho_spec_inds_fixed']

                try:
                    h5_sho_spec_vals = h5_sho_guess.parent.create_dataset(
                        "h5_sho_spec_vals_fixed", shape=(1, 1),
                        dtype='uint32')
                    h5_sho_spec_vals[:] = 0
                    h5_sho_spec_vals.attrs['labels'] = 'labels'
                    h5_sho_spec_vals.attrs['units'] = 'units'
                except RuntimeError:
                    print("It seems that the file has already been patched. "
                          "Will use previously computed ancillary datasets")
                    h5_sho_spec_vals = h5_sho_guess.parent[
                        'h5_sho_spec_vals_fixed']

            link_as_main(h5_sho_guess, h5_pos_inds, h5_pos_vals,
                         h5_sho_spec_inds, h5_sho_spec_vals)

            sho_inds_and_vals = [h5_sho_spec_inds, h5_sho_spec_vals]

            for dset in sho_inds_and_vals:
                spec_labels = get_attr(dset, 'labels')
                try:
                    spec_units = get_attr(dset, 'units')
                    if len(spec_units) != len(spec_labels):
                        raise KeyError
                except KeyError:
                    spec_units = [''.encode('utf-8') for _ in spec_labels]
                    dset.attrs['units'] = spec_units
                except:
                    raise

    h5_file.flush()
    h5_file.attrs['translator'] = 'V3patcher'.encode('utf-8')
    h5_file.close()

    return h5_path
def check_for_matching_attrs(h5_obj, new_parms=None, verbose=False):
    """
    Compares attributes in the given H5 object against those in the provided
    dictionary and returns True if the parameters match, and False otherwise

    Parameters
    ----------
    h5_obj : h5py object (Dataset or :class:`h5py.Group`)
        Object whose attributes will be compared against new_parms
    new_parms : dict, optional. default = empty dictionary
        Parameters to compare against the attributes present in h5_obj
    verbose : bool, optional, default = False
        Whether or not to print debugging statements

    Returns
    -------
    tests : bool
        Whether or not all parameters in new_parms matched with those in
        h5_obj's attributes
    """
    if not isinstance(h5_obj, (h5py.Dataset, h5py.Group, h5py.File)):
        raise TypeError('h5_obj should be a h5py.Dataset, h5py.Group, or '
                        'h5py.File object')
    if new_parms is None:
        new_parms = dict()
    else:
        if not isinstance(new_parms, dict):
            raise TypeError('new_parms should be a dictionary')

    tests = []
    for key in new_parms.keys():

        if verbose:
            print('Looking for new attribute named: {}'.format(key))

        # HDF5 cannot store None as an attribute anyway. ignore
        if new_parms[key] is None:
            continue

        try:
            old_value = get_attr(h5_obj, key)
        except KeyError:
            # if parameter was not found assume that something has changed
            if verbose:
                print('New parm: {} \t- new parm not in group *****'
                      ''.format(key))
            tests.append(False)
            break

        if isinstance(old_value, np.ndarray):
            if not isinstance(new_parms[key], collections.abc.Iterable):
                if verbose:
                    print('New parm: {} \t- new parm not iterable unlike '
                          'old parm *****'.format(key))
                tests.append(False)
                break
            new_array = np.array(new_parms[key])
            if old_value.size != new_array.size:
                if verbose:
                    print('New parm: {} \t- are of different sizes ****'
                          ''.format(key))
                tests.append(False)
            else:
                try:
                    answer = np.allclose(old_value, new_array)
                except TypeError:
                    # comes here when comparing string arrays
                    # Not sure of a better way
                    answer = []
                    for old_val, new_val in zip(old_value, new_array):
                        answer.append(old_val == new_val)
                    answer = np.all(answer)
                if verbose:
                    print('New parm: {} \t- match: {}'.format(key, answer))
                tests.append(answer)
        else:
            """
            if isinstance(new_parms[key], collections.abc.Iterable):
                if verbose:
                    print('New parm: {} \t- new parm is iterable unlike old '
                          'parm *****'.format(key))
                tests.append(False)
                break
            """
            answer = np.all(new_parms[key] == old_value)
            if verbose:
                print('New parm: {} \t- match: {}'.format(key, answer))
            tests.append(answer)
    if verbose:
        print('')

    return all(tests)
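# Usage sketch (illustrative; the parameter names are hypothetical): check
# whether an existing results group was computed with the same parameters
# before recomputing.
#
# >>> parms = {'n_clusters': 4, 'algorithm': 'KMeans'}
# >>> if check_for_matching_attrs(h5_results_grp, new_parms=parms):
# ...     print('Reusing previously computed results')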
def _write_results_chunk(self):
    """
    Writes the labels and mean response to the h5 file

    Returns
    ---------
    h5_group : HDF5 Group reference
        Reference to the group that contains the clustering results
    """
    print('Writing clustering results to file.')
    num_clusters = self.__mean_resp.shape[0]

    self.h5_results_grp = create_results_group(
        self.h5_main, self.process_name,
        h5_parent_group=self._h5_target_group)
    self._write_source_dset_provenance()

    write_simple_attrs(self.h5_results_grp, self.parms_dict)

    h5_labels = write_main_dataset(self.h5_results_grp,
                                   np.uint32(self.__labels.reshape([-1, 1])),
                                   'Labels', 'Cluster ID', 'a. u.', None,
                                   Dimension('Cluster', 'ID', 1),
                                   h5_pos_inds=self.h5_main.h5_pos_inds,
                                   h5_pos_vals=self.h5_main.h5_pos_vals,
                                   aux_spec_prefix='Cluster_',
                                   dtype=np.uint32)

    if self.num_comps != self.h5_main.shape[1]:
        '''
        Setup the Spectroscopic Indices and Values for the Mean Response if
        we didn't use all components
        Note that a sliced spectroscopic matrix may not be contiguous.
        Let's just lose the spectroscopic data for now until a better method
        is figured out
        '''
        """
        if isinstance(self.data_slice[1], np.ndarray):
            centroid_vals_mat = h5_centroids.h5_spec_vals[
                self.data_slice[1].tolist()]
        else:
            centroid_vals_mat = h5_centroids.h5_spec_vals[
                self.data_slice[1]]
        ds_centroid_values.data[0, :] = centroid_vals_mat
        """
        if isinstance(self.data_slice[1], np.ndarray):
            vals_slice = self.data_slice[1].tolist()
        else:
            vals_slice = self.data_slice[1]
        vals = self.h5_main.h5_spec_vals[:, vals_slice].squeeze()
        new_spec = Dimension('Original_Spectral_Index', 'a.u.', vals)
        h5_inds, h5_vals = write_ind_val_dsets(self.h5_results_grp,
                                               new_spec, is_spectral=True)

    else:
        h5_inds = self.h5_main.h5_spec_inds
        h5_vals = self.h5_main.h5_spec_vals

    # For now, link centroids with default spectroscopic indices and values.
    h5_centroids = write_main_dataset(self.h5_results_grp, self.__mean_resp,
                                      'Mean_Response',
                                      get_attr(self.h5_main, 'quantity')[0],
                                      get_attr(self.h5_main, 'units')[0],
                                      Dimension('Cluster', 'a. u.',
                                                np.arange(num_clusters)),
                                      None,
                                      h5_spec_inds=h5_inds,
                                      aux_pos_prefix='Mean_Resp_Pos_',
                                      h5_spec_vals=h5_vals)

    # Marking completion:
    self._status_dset_name = 'completed_positions'
    self._h5_status_dset = self.h5_results_grp.create_dataset(
        self._status_dset_name,
        data=np.ones(self.h5_main.shape[0], dtype=np.uint8))
    # keeping legacy option:
    self.h5_results_grp.attrs['last_pixel'] = self.h5_main.shape[0]

    return self.h5_results_grp
def reshape_from_lines_to_pixels(h5_main, pts_per_cycle, scan_step_x_m=None):
    """
    Breaks up the provided raw G-mode dataset into lines and pixels (from
    just lines)

    Parameters
    ----------
    h5_main : h5py.Dataset object
        Reference to the main dataset that contains the raw data that is
        only broken up by lines
    pts_per_cycle : unsigned int
        Number of points in a single pixel
    scan_step_x_m : float, optional. Default = 1
        Step in meters for pixels

    Returns
    -------
    h5_resh : h5py.Dataset object
        Reference to the main dataset that contains the reshaped data
    """
    if not check_if_main(h5_main):
        raise TypeError('h5_main is not a Main dataset')
    h5_main = USIDataset(h5_main)
    if pts_per_cycle % 1 != 0 or pts_per_cycle < 1:
        raise TypeError('pts_per_cycle should be a positive integer')
    if scan_step_x_m is not None:
        if not isinstance(scan_step_x_m, Number):
            raise TypeError('scan_step_x_m should be a real number')
    else:
        scan_step_x_m = 1

    if h5_main.shape[1] % pts_per_cycle != 0:
        raise ValueError('Error in reshaping the provided dataset to pixels. '
                         'Check points per pixel')

    num_cols = int(h5_main.shape[1] / pts_per_cycle)

    # TODO: DO NOT assume simple 1 spectral dimension!
    single_ao = np.squeeze(h5_main.h5_spec_vals[:, :pts_per_cycle])

    spec_dims = Dimension(
        get_attr(h5_main.h5_spec_vals, 'labels')[0],
        get_attr(h5_main.h5_spec_vals, 'units')[0], single_ao)

    # TODO: DO NOT assume simple 1D in positions!
    pos_dims = [
        Dimension('X', 'm', np.linspace(0, scan_step_x_m, num_cols)),
        Dimension('Y', 'm',
                  np.linspace(0, h5_main.h5_pos_vals[1, 0],
                              h5_main.shape[0]))
    ]

    h5_group = create_results_group(h5_main, 'Reshape')
    # TODO: Create empty datasets and then write for very large datasets
    h5_resh = write_main_dataset(h5_group,
                                 (num_cols * h5_main.shape[0], pts_per_cycle),
                                 'Reshaped_Data',
                                 get_attr(h5_main, 'quantity')[0],
                                 get_attr(h5_main, 'units')[0],
                                 pos_dims, spec_dims,
                                 chunks=(10, pts_per_cycle),
                                 dtype=h5_main.dtype,
                                 compression=h5_main.compression)

    # TODO: DON'T write in one shot assuming small datasets fit in memory!
    print('Starting to reshape G-mode line data. Please be patient')
    h5_resh[()] = np.reshape(h5_main[()], (-1, pts_per_cycle))
    print('Finished reshaping G-mode line data to rows and columns')

    return USIDataset(h5_resh)
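# Hedged usage sketch for reshape_from_lines_to_pixels; the file name,
# dataset path, pixel size, and points-per-pixel below are hypothetical
# illustrative values.
def _demo_reshape_from_lines_to_pixels():
    import h5py

    with h5py.File('gmode_lines.h5', mode='r+') as h5_f:
        h5_main = h5_f['Measurement_000/Channel_000/Raw_Data']
        # Each line must hold an integer number of 256-point pixels
        h5_resh = reshape_from_lines_to_pixels(h5_main, pts_per_cycle=256,
                                               scan_step_x_m=50e-9)
        print(h5_resh.shape)  # (num_rows * num_cols, 256)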
def plot_cluster_h5_group(h5_group, labels_kwargs=None, centroids_kwargs=None):
    """
    Plots the cluster labels and mean response for each cluster

    Parameters
    ----------
    h5_group : h5py.Group object
        H5 group containing the labels and mean response
    labels_kwargs : dict, optional
        keyword arguments for the labels plot. NOT enabled yet.
    centroids_kwargs : dict, optional
        keyword arguments for the centroids plot. NOT enabled yet.

    Returns
    -------
    fig_labels : figure handle
        Figure containing the labels
    fig_centroids : figure handle
        Figure containing the centroids
    """
    if not isinstance(h5_group, h5py.Group):
        raise TypeError('h5_group should be a h5py.Group')
    h5_labels = USIDataset(h5_group['Labels'])
    h5_centroids = USIDataset(h5_group['Mean_Response'])

    labels_mat = np.squeeze(h5_labels.get_n_dim_form())

    fig_labs = None
    if labels_mat.ndim == 1:
        fig_labs, axis_labs = plt.subplots(figsize=(5.5, 5))
        axis_labs.plot(h5_labels.get_pos_values(h5_labels.pos_dim_labels[0]),
                       labels_mat)
        axis_labs.set_xlabel(h5_labels.pos_dim_descriptors[0])
        axis_labs.set_ylabel('Cluster index')
        axis_labs.set_title(
            get_attr(h5_group, 'cluster_algorithm') + ' Labels')
    elif labels_mat.ndim == 2:
        fig_labs, axis_labs = plot_cluster_labels(
            labels_mat, num_clusters=h5_centroids.shape[0],
            x_label=h5_labels.pos_dim_descriptors[0],
            y_label=h5_labels.pos_dim_descriptors[1],
            x_vec=h5_labels.get_pos_values(h5_labels.pos_dim_labels[0]),
            y_vec=h5_labels.get_pos_values(h5_labels.pos_dim_labels[1]),
            title=get_attr(h5_group, 'cluster_algorithm') + ' Labels')
    else:
        print('Unable to visualize labels with 3 or more position '
              'dimensions!')

    # TODO: probably not a great idea to load the entire dataset to memory
    centroids_mat = h5_centroids.get_n_dim_form()

    fig_cent = None
    if len(h5_centroids.spec_dim_labels) == 1:
        legend_mode = 2
        if h5_centroids.shape[0] < 6:
            legend_mode = 1
        fig_cent, axis_cent = plot_cluster_centroids(
            centroids_mat,
            h5_centroids.get_spec_values(h5_centroids.spec_dim_labels[0]),
            legend_mode=legend_mode,
            x_label=h5_centroids.spec_dim_descriptors[0],
            y_label=h5_centroids.data_descriptor,
            overlayed=h5_centroids.shape[0] < 6,
            title=get_attr(h5_group, 'cluster_algorithm') + ' Centroid',
            amp_units=get_attr(h5_centroids, 'units'))
    elif len(h5_centroids.spec_dim_labels) == 2:
        # stack of spectrograms
        # np.complex was removed from NumPy; test the dtype kind instead
        if np.issubdtype(h5_centroids.dtype, np.complexfloating):
            fig_cent, axis_cent = plot_complex_spectra(
                centroids_mat, subtitle_prefix='Cluster',
                title=get_attr(h5_group, 'cluster_algorithm') + ' Centroid',
                x_label=h5_centroids.spec_dim_descriptors[0],
                y_label=h5_centroids.spec_dim_descriptors[1],
                amp_units=get_attr(h5_centroids, 'units'))
        else:
            fig_cent, axis_cent = plot_map_stack(
                centroids_mat, color_bar_mode='each', evenly_spaced=True,
                title='Cluster',
                heading=get_attr(h5_group, 'cluster_algorithm') + ' Centroid')

    return fig_labs, fig_cent
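# Minimal sketch of calling plot_cluster_h5_group on a saved clustering
# results group; the file name and group path are hypothetical.
def _demo_plot_cluster_h5_group():
    import h5py
    import matplotlib.pyplot as plt

    with h5py.File('clustering_results.h5', mode='r') as h5_f:
        h5_grp = h5_f['Measurement_000/Channel_000/Raw_Data-Cluster_000']
        fig_labels, fig_centroids = plot_cluster_h5_group(h5_grp)
    plt.show()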
def check_if_main(h5_main, verbose=False):
    """
    Checks the input dataset to see if it has all the necessary
    features to be considered a Main dataset. This means that the
    dataset has dimension scales of the correct size and has the
    following attributes:

    * quantity
    * units
    * main_data_name
    * pyNSID_version
    * data_type
    * modality
    * source

    In addition, the shapes of the ancillary matrices should match with
    that of h5_main

    Parameters
    ----------
    h5_main : HDF5 Dataset
        Dataset of interest
    verbose : Boolean (Optional. Default = False)
        Whether or not to print statements

    Returns
    -------
    success : Boolean
        True if all tests pass
    """
    if not isinstance(h5_main, h5py.Dataset):
        if verbose:
            print('{} is not an HDF5 Dataset object.'.format(h5_main))
        return False

    number_of_dims = 0
    for dim in h5_main.dims:
        if np.array(dim.values()).size > 0:
            number_of_dims += 1

    if len(h5_main.shape) != number_of_dims:
        if verbose:
            print('Main data does not have full set of dimension scales. '
                  'Provided object has shape: {} but only {} dimensional '
                  'scales'.format(h5_main.shape, number_of_dims))
        return False

    h5_group = h5_main.parent

    # Check for Datasets
    attrs_names = ['dimension_type', 'name', 'quantity', 'units']

    # Check for all required attributes in dataset
    main_attrs_names = [
        'quantity', 'units', 'main_data_name', 'pyNSID_version',
        'data_type', 'modality', 'source'
    ]
    main_attr_success = np.all(
        [att in h5_main.attrs for att in main_attrs_names])
    if verbose:
        print('All Attributes in dataset: ', main_attr_success)
    if not main_attr_success:
        if verbose:
            print('{} does not have the mandatory attributes'.format(
                h5_main.name))
        return False

    for attr_name in main_attrs_names:
        val = get_attr(h5_main, attr_name)
        if not isinstance(val, str):
            if verbose:
                print('Attribute {} of {} found to be {}. Expected a string'.
                      format(attr_name, h5_main.name, val))
            return False

    length_success = []
    dset_success = []
    attr_success = []
    # Check for Validity of Dimensional Scales
    for i, dimension in enumerate(h5_main.dims):
        # check for all required attributes
        h5_dim_dset = h5_group[dimension.label]
        attr_success.append(
            np.all([att in h5_dim_dset.attrs for att in attrs_names]))
        # the dimensional scale itself must be an HDF5 dataset
        dset_success.append(isinstance(h5_dim_dset, h5py.Dataset))
        # dimensional scale has to be 1D
        if len(h5_dim_dset.shape) == 1:
            # and of the same length as the shape of the dataset
            length_success.append(h5_main.shape[i] == h5_dim_dset.shape[0])
        else:
            length_success.append(False)
    # We have the lists now and can report which dimension scale failed
    if np.all([np.all(attr_success), np.all(length_success),
               np.all(dset_success)]):
        if verbose:
            print('Dimensions: All Attributes: ', np.all(attr_success))
            print('Dimensions: All Correct Length: ', np.all(length_success))
            print('Dimensions: All h5 Datasets: ', np.all(dset_success))
    else:
        if False in length_success:
            print('length of dimension scale {} is wrong'.format(
                length_success.index(False)))
        if False in attr_success:
            print('attributes in dimension scale {} are wrong'.format(
                attr_success.index(False)))
        if False in dset_success:
            print('dimension scale {} is not a dataset'.format(
                dset_success.index(False)))
        return False

    return True
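# Hedged sketch of using check_if_main to vet a candidate dataset before
# wrapping it for analysis; the file name and dataset path are hypothetical.
def _demo_check_if_main():
    import h5py

    with h5py.File('nsid_data.h5', mode='r') as h5_f:
        h5_dset = h5_f['Measurement_000/Channel_000/nDim_Data']
        if check_if_main(h5_dset, verbose=True):
            print('{} is a valid Main dataset'.format(h5_dset.name))
        else:
            print('{} failed one or more Main dataset checks'.format(
                h5_dset.name))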