def validate_aux_dset_pair(test_class, h5_group, h5_inds, h5_vals, dim_names, dim_units, inds_matrix, vals_matrix=None, base_name=None, h5_main=None, is_spectral=True): if vals_matrix is None: vals_matrix = inds_matrix if base_name is None: if is_spectral: base_name = 'Spectroscopic' else: base_name = 'Position' else: test_class.assertIsInstance(base_name, (str, unicode)) for h5_dset, exp_dtype, exp_name, ref_data in zip([h5_inds, h5_vals], [INDICES_DTYPE, VALUES_DTYPE], [base_name + '_Indices', base_name + '_Values'], [inds_matrix, vals_matrix]): if isinstance(h5_main, h5py.Dataset): test_class.assertEqual(h5_main.file[h5_main.attrs[exp_name]], h5_dset) test_class.assertIsInstance(h5_dset, h5py.Dataset) test_class.assertEqual(h5_dset.parent, h5_group) test_class.assertEqual(h5_dset.name.split('/')[-1], exp_name) test_class.assertTrue(np.allclose(ref_data, h5_dset[()])) test_class.assertEqual(h5_dset.dtype, exp_dtype) test_class.assertTrue(np.all([_ in h5_dset.attrs.keys() for _ in ['labels', 'units']])) test_class.assertTrue(np.all([x == y for x, y in zip(dim_names, get_attr(h5_dset, 'labels'))])) test_class.assertTrue(np.all([x == y for x, y in zip(dim_units, get_attr(h5_dset, 'units'))])) # assert region references for dim_ind, curr_name in enumerate(dim_names): expected = np.squeeze(ref_data[:, dim_ind]) if is_spectral: expected = np.squeeze(ref_data[dim_ind]) test_class.assertTrue(np.allclose(expected, np.squeeze(h5_dset[h5_dset.attrs[curr_name]])))
def setUp(self): data_utils.make_beps_file() self.orig_labels_order = ['X', 'Y', 'Cycle', 'Bias'] self.h5_file = h5py.File(data_utils.std_beps_path, mode='r') h5_grp = self.h5_file['/Raw_Measurement/'] self.source_nd_s2f = h5_grp['n_dim_form'][()] self.source_nd_f2s = self.source_nd_s2f.transpose(1, 0, 3, 2) self.h5_source = USIDataset(h5_grp['source_main']) self.pos_dims=[] self.spec_dims=[] for dim_name, dim_units in zip(self.h5_source.pos_dim_labels, hdf_utils.get_attr(self.h5_source.h5_pos_inds, 'units')): self.pos_dims.append( Dimension(dim_name, dim_units, h5_grp[dim_name][()])) for dim_name, dim_units in zip(self.h5_source.spec_dim_labels, hdf_utils.get_attr(self.h5_source.h5_spec_inds, 'units')): self.spec_dims.append( Dimension(dim_name, dim_units, h5_grp[dim_name][()])) res_grp_0 = h5_grp['source_main-Fitter_000'] self.results_0_nd_s2f = res_grp_0['n_dim_form'][()] self.results_0_nd_f2s = self.results_0_nd_s2f.transpose(1, 0, 3, 2) self.h5_compound = USIDataset(res_grp_0['results_main']) res_grp_1 = h5_grp['source_main-Fitter_001'] self.results_1_nd_s2f = res_grp_1['n_dim_form'][()] self.results_1_nd_f2s = self.results_1_nd_s2f.transpose(1, 0, 3, 2) self.h5_complex = USIDataset(res_grp_1['results_main'])
def test_group_indexing_sequential(self): file_path = 'test.h5' self.__delete_existing_file(file_path) with h5py.File(file_path) as h5_f: writer = HDFwriter(h5_f) micro_group_0 = VirtualGroup('Test_', attrs={ 'att_1': 'string_val', 'att_2': 1.2345 }) [h5_group_0] = writer.write(micro_group_0) _ = writer.write(VirtualGroup('blah')) self.assertIsInstance(h5_group_0, h5py.Group) self.assertEqual(h5_group_0.name, '/Test_000') for key, expected_val in micro_group_0.attrs.items(): self.assertTrue( np.all(get_attr(h5_group_0, key) == expected_val)) micro_group_1 = VirtualGroup('Test_', attrs={ 'att_3': [1, 2, 3, 4], 'att_4': ['str_1', 'str_2', 'str_3'] }) [h5_group_1] = writer.write(micro_group_1) self.assertIsInstance(h5_group_1, h5py.Group) self.assertEqual(h5_group_1.name, '/Test_001') for key, expected_val in micro_group_1.attrs.items(): self.assertTrue( np.all(get_attr(h5_group_1, key) == expected_val)) os.remove(file_path)
def test_group_indexing_simultaneous(self): file_path = 'test.h5' self.__delete_existing_file(file_path) with h5py.File(file_path) as h5_f: micro_group_0 = VirtualGroup('Test_', attrs = {'att_1': 'string_val', 'att_2': 1.2345}) micro_group_1 = VirtualGroup('Test_', attrs={'att_3': [1, 2, 3, 4], 'att_4': ['str_1', 'str_2', 'str_3']}) root_group = VirtualGroup('', children=[VirtualGroup('blah'), micro_group_0, VirtualGroup('meh'), micro_group_1]) writer = HDFwriter(h5_f) h5_refs_list = writer.write(root_group) [h5_group_1] = get_h5_obj_refs(['Test_001'], h5_refs_list) [h5_group_0] = get_h5_obj_refs(['Test_000'], h5_refs_list) self.assertIsInstance(h5_group_0, h5py.Group) self.assertEqual(h5_group_0.name, '/Test_000') for key, expected_val in micro_group_0.attrs.items(): self.assertTrue(np.all(get_attr(h5_group_0, key) == expected_val)) self.assertIsInstance(h5_group_1, h5py.Group) self.assertEqual(h5_group_1.name, '/Test_001') for key, expected_val in micro_group_1.attrs.items(): self.assertTrue(np.all(get_attr(h5_group_1, key) == expected_val)) os.remove(file_path)
def test_write_dset_under_root(self): file_path = 'test.h5' self.__delete_existing_file(file_path) with h5py.File(file_path) as h5_f: writer = HDFwriter(h5_f) data = np.random.rand(5, 7) attrs = {'att_1': 'string_val', 'att_2': 1.2345, 'att_3': [1, 2, 3, 4], 'att_4': ['str_1', 'str_2', 'str_3'], 'labels': {'even_rows': (slice(0, None, 2), slice(None)), 'odd_rows': (slice(1, None, 2), slice(None))} } micro_dset = VirtualDataset('test', data) micro_dset.attrs = attrs.copy() [h5_dset] = writer.write(micro_dset) self.assertIsInstance(h5_dset, h5py.Dataset) reg_ref = attrs.pop('labels') self.assertEqual(len(h5_dset.attrs), len(attrs) + 1 + len(reg_ref)) for key, expected_val in attrs.items(): self.assertTrue(np.all(get_attr(h5_dset, key) == expected_val)) self.assertTrue(np.all([x in list(reg_ref.keys()) for x in get_attr(h5_dset, 'labels')])) expected_data = [data[:None:2], data[1:None:2]] written_data = [h5_dset[h5_dset.attrs['even_rows']], h5_dset[h5_dset.attrs['odd_rows']]] for exp, act in zip(expected_data, written_data): self.assertTrue(np.allclose(exp, act)) os.remove(file_path)
def reshape_from_lines_to_pixels(h5_main, pts_per_cycle, scan_step_x_m=None):
    """
    Breaks up the provided raw G-mode dataset into lines and pixels (from just lines)

    Parameters
    ----------
    h5_main : h5py.Dataset object
        Reference to the main dataset that contains the raw data that is only broken up by lines
    pts_per_cycle : unsigned int
        Number of points in a single pixel
    scan_step_x_m : float
        Step in meters for pixels

    Returns
    -------
    h5_resh : h5py.Dataset object
        Reference to the main dataset that contains the reshaped data
    """
    if not check_if_main(h5_main):
        raise TypeError('h5_main is not a Main dataset')
    h5_main = USIDataset(h5_main)

    if pts_per_cycle % 1 != 0 or pts_per_cycle < 1:
        raise TypeError('pts_per_cycle should be a positive integer')
    if scan_step_x_m is not None:
        if not isinstance(scan_step_x_m, Number):
            raise TypeError('scan_step_x_m should be a real number')
    else:
        scan_step_x_m = 1

    if h5_main.shape[1] % pts_per_cycle != 0:
        raise ValueError('Error in reshaping the provided dataset to pixels. Check points per pixel')

    num_cols = int(h5_main.shape[1] / pts_per_cycle)

    # TODO: DO NOT assume simple 1 spectral dimension!
    single_ao = np.squeeze(h5_main.h5_spec_vals[:, :pts_per_cycle])
    spec_dims = Dimension(get_attr(h5_main.h5_spec_vals, 'labels')[0],
                          get_attr(h5_main.h5_spec_vals, 'units')[0], single_ao)

    # TODO: DO NOT assume simple 1D in positions!
    pos_dims = [Dimension('X', 'm', np.linspace(0, scan_step_x_m, num_cols)),
                Dimension('Y', 'm', np.linspace(0, h5_main.h5_pos_vals[1, 0], h5_main.shape[0]))]

    h5_group = create_results_group(h5_main, 'Reshape')

    # TODO: Create empty datasets and then write for very large datasets
    h5_resh = write_main_dataset(h5_group, (num_cols * h5_main.shape[0], pts_per_cycle), 'Reshaped_Data',
                                 get_attr(h5_main, 'quantity')[0], get_attr(h5_main, 'units')[0],
                                 pos_dims, spec_dims, chunks=(10, pts_per_cycle), dtype=h5_main.dtype,
                                 compression=h5_main.compression)

    # TODO: DON'T write in one shot assuming small datasets fit in memory!
    print('Starting to reshape G-mode line data. Please be patient')
    h5_resh[()] = np.reshape(h5_main[()], (-1, pts_per_cycle))
    print('Finished reshaping G-mode line data to rows and columns')

    return USIDataset(h5_resh)
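# ---------------------------------------------------------------------------
# Hedged usage sketch, not part of the original module: how the function above
# might be called on an already-open G-mode file. The file path, internal HDF5
# path and numeric values are hypothetical placeholders.
# ---------------------------------------------------------------------------
def _example_reshape_from_lines_to_pixels():
    """Minimal sketch of calling reshape_from_lines_to_pixels()."""
    import h5py
    from pyUSID import USIDataset  # import path may vary between pyUSID versions

    with h5py.File('gmode_line_data.h5', mode='r+') as h5_file:
        # Assumed location of the raw, line-by-line G-mode data
        h5_main = USIDataset(h5_file['Measurement_000/Channel_000/Raw_Data'])
        # 2048 points per pixel and a 100 nm x-step are made-up values
        h5_resh = reshape_from_lines_to_pixels(h5_main, 2048, scan_step_x_m=100e-9)
        print('Reshaped dataset shape:', h5_resh.shape)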
def _write_results_chunk(self): """ Writes the labels and mean response to the h5 file Returns --------- h5_group : HDF5 Group reference Reference to the group that contains the clustering results """ print('Writing clustering results to file.') num_clusters = self.__mean_resp.shape[0] h5_cluster_group = create_results_group(self.h5_main, self.process_name) write_simple_attrs(h5_cluster_group, self.parms_dict) h5_cluster_group.attrs['last_pixel'] = self.h5_main.shape[0] h5_labels = write_main_dataset(h5_cluster_group, np.uint32(self.__labels.reshape([-1, 1])), 'Labels', 'Cluster ID', 'a. u.', None, Dimension('Cluster', 'ID', 1), h5_pos_inds=self.h5_main.h5_pos_inds, h5_pos_vals=self.h5_main.h5_pos_vals, aux_spec_prefix='Cluster_', dtype=np.uint32) if self.num_comps != self.h5_main.shape[1]: ''' Setup the Spectroscopic Indices and Values for the Mean Response if we didn't use all components Note that a sliced spectroscopic matrix may not be contiguous. Let's just lose the spectroscopic data for now until a better method is figured out ''' """ if isinstance(self.data_slice[1], np.ndarray): centroid_vals_mat = h5_centroids.h5_spec_vals[self.data_slice[1].tolist()] else: centroid_vals_mat = h5_centroids.h5_spec_vals[self.data_slice[1]] ds_centroid_values.data[0, :] = centroid_vals_mat """ if isinstance(self.data_slice[1], np.ndarray): vals_slice = self.data_slice[1].tolist() else: vals_slice = self.data_slice[1] vals = self.h5_main.h5_spec_vals[:, vals_slice].squeeze() new_spec = Dimension('Original_Spectral_Index', 'a.u.', vals) h5_inds, h5_vals = write_ind_val_dsets(h5_cluster_group, new_spec, is_spectral=True) else: h5_inds = self.h5_main.h5_spec_inds h5_vals = self.h5_main.h5_spec_vals # For now, link centroids with default spectroscopic indices and values. h5_centroids = write_main_dataset(h5_cluster_group, self.__mean_resp, 'Mean_Response', get_attr(self.h5_main, 'quantity')[0], get_attr(self.h5_main, 'units')[0], Dimension('Cluster', 'a. u.', np.arange(num_clusters)), None, h5_spec_inds=h5_inds, aux_pos_prefix='Mean_Resp_Pos_', h5_spec_vals=h5_vals) return h5_cluster_group
def _is_legal(self, h5_main, variables=None):
    """
    Checks whether or not the provided object can be analyzed by this class.

    Parameters
    ----------
    h5_main : h5py.Dataset instance
        The dataset containing the SHO Fit (not necessarily the dataset directly resulting from SHO fit)
        over which the loop projection, guess, and fit will be performed.
    variables : list(string)
        The dimensions needed to be present in the attributes of h5_main to analyze the data with Model.

    Returns
    -------
    legal : Boolean
        Whether or not this dataset satisfies the necessary conditions for analysis
    """
    if variables is None:
        variables = ['DC_Offset']

    file_data_type = get_attr(h5_main.file, 'data_type')
    meas_grp_name = h5_main.name.split('/')
    h5_meas_grp = h5_main.file[meas_grp_name[1]]
    meas_data_type = get_attr(h5_meas_grp, 'data_type')

    if h5_main.dtype != sho32:
        warn('Provided dataset is not a SHO results dataset.')
        return False

    # Compare the file- and Measurement-group-level data types, ignoring case
    if meas_data_type.lower() != file_data_type.lower():
        warn('Mismatch between file and Measurement group data types for the chosen dataset.')
        print('File data type is {}. The data type for Measurement group {} is {}'
              .format(file_data_type, h5_meas_grp.name, meas_data_type))
        return False

    if file_data_type == 'BEPSData':
        if get_attr(h5_meas_grp, 'VS_mode') not in ['DC modulation mode', 'current mode']:
            warn('Provided dataset is not a DC modulation or current mode BEPS dataset')
            return False
        elif get_attr(h5_meas_grp, 'VS_cycle_fraction') != 'full':
            warn('Provided dataset does not have full cycles')
            return False
    elif file_data_type == 'cKPFMData':
        if get_attr(h5_meas_grp, 'VS_mode') != 'cKPFM':
            warn('Provided dataset has an unsupported VS_mode.')
            return False

    return super(BELoopFitter, self)._is_legal(h5_main, variables)
def test_dependent_dim(self): with h5py.File(data_utils.relaxation_path, mode='r') as h5_f: h5_inds = h5_f[ '/Measurement_000/Channel_000/Spectroscopic_Indices'] h5_vals = h5_f['/Measurement_000/Channel_000/Spectroscopic_Values'] spec_dim_names = hdf_utils.get_attr(h5_inds, 'labels') ret_dict = hdf_utils.get_unit_values(h5_inds, h5_vals) for dim_ind, dim_name in enumerate(spec_dim_names): exp_val = hdf_utils.get_attr(h5_inds, 'unit_vals_dim_' + str(dim_ind)) act_val = ret_dict[dim_name] self.assertTrue(np.allclose(exp_val, act_val))
def test_write_dset_under_existing_group(self): file_path = 'test.h5' self.__delete_existing_file(file_path) with h5py.File(file_path) as h5_f: writer = HDFwriter(h5_f) h5_g = writer._create_group(h5_f, VirtualGroup('test_group')) self.assertIsInstance(h5_g, h5py.Group) data = np.random.rand(5, 7) attrs = { 'att_1': 'string_val', 'att_2': 1.2345, 'att_3': [1, 2, 3, 4], 'att_4': ['str_1', 'str_2', 'str_3'], 'labels': { 'even_rows': (slice(0, None, 2), slice(None)), 'odd_rows': (slice(1, None, 2), slice(None)) } } micro_dset = VirtualDataset('test', data, parent='/test_group') micro_dset.attrs = attrs.copy() [h5_dset] = writer.write(micro_dset) self.assertIsInstance(h5_dset, h5py.Dataset) self.assertEqual(h5_dset.parent, h5_g) reg_ref = attrs.pop('labels') self.assertEqual(len(h5_dset.attrs), len(attrs) + 1 + len(reg_ref)) for key, expected_val in attrs.items(): self.assertTrue(np.all(get_attr(h5_dset, key) == expected_val)) self.assertTrue( np.all([ x in list(reg_ref.keys()) for x in get_attr(h5_dset, 'labels') ])) expected_data = [data[:None:2], data[1:None:2]] written_data = [ h5_dset[h5_dset.attrs['even_rows']], h5_dset[h5_dset.attrs['odd_rows']] ] for exp, act in zip(expected_data, written_data): self.assertTrue(np.allclose(exp, act)) os.remove(file_path)
def get_all_dimensions(): pos_dims = [] spec_dims = [] with h5py.File(test_h5_file_path, mode='r') as h5_f: h5_raw_grp = h5_f['Raw_Measurement'] usi_main = USIDataset(h5_raw_grp['source_main']) for dim_name, dim_units in zip(usi_main.pos_dim_labels, hdf_utils.get_attr(usi_main.h5_pos_inds, 'units')): pos_dims.append(Dimension(dim_name, dim_units, h5_raw_grp[dim_name][()])) for dim_name, dim_units in zip(usi_main.spec_dim_labels, hdf_utils.get_attr( usi_main.h5_spec_inds, 'units')): spec_dims.append(Dimension(dim_name, dim_units, h5_raw_grp[dim_name][()])) return pos_dims, spec_dims
def _write_results_chunk(self): """ Writes the labels and mean response to the h5 file Returns --------- h5_group : HDF5 Group reference Reference to the group that contains the decomposition results """ h5_decomp_group = create_results_group(self.h5_main, self.process_name) write_simple_attrs(h5_decomp_group, self.parms_dict) write_simple_attrs(h5_decomp_group, {'n_components': self.__components.shape[0], 'n_samples': self.h5_main.shape[0], 'last_pixel': self.h5_main.shape[0]}) decomp_desc = Dimension('Endmember', 'a. u.', self.__components.shape[0]) # equivalent to V - compound / complex h5_components = write_main_dataset(h5_decomp_group, self.__components, 'Components', get_attr(self.h5_main, 'quantity')[0], 'a.u.', decomp_desc, None, h5_spec_inds=self.h5_main.h5_spec_inds, h5_spec_vals=self.h5_main.h5_spec_vals) # equivalent of U - real h5_projections = write_main_dataset(h5_decomp_group, np.float32(self.__projection), 'Projection', 'abundance', 'a.u.', None, decomp_desc, dtype=np.float32, h5_pos_inds=self.h5_main.h5_pos_inds, h5_pos_vals=self.h5_main.h5_pos_vals) # return the h5 group object self.h5_results_grp = h5_decomp_group return self.h5_results_grp
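# ---------------------------------------------------------------------------
# Hedged sketch, not part of the original module: a minimal, self-contained
# illustration of the Dimension / write_main_dataset pattern used by the
# _write_results_chunk() methods in this file. The file name, group name and
# dimension sizes are made up; import paths may differ between pyUSID versions.
# ---------------------------------------------------------------------------
def _example_write_main_dataset():
    """Minimal sketch of writing a tiny USID main dataset."""
    import numpy as np
    import h5py
    import pyUSID as usid

    with h5py.File('example_results.h5', mode='w') as h5_f:
        h5_grp = h5_f.create_group('Toy_Results')
        data = np.random.rand(3, 5)                 # 3 positions x 5 spectral points
        pos_dims = usid.Dimension('X', 'm', 3)
        spec_dims = usid.Dimension('Frequency', 'Hz', 5)
        h5_main = usid.hdf_utils.write_main_dataset(h5_grp, data, 'Toy_Data',
                                                    'Amplitude', 'a.u.',
                                                    pos_dims, spec_dims)
        print(h5_main)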
def _calc_raw(self):
    """
    Regenerates the raw spectra from the SHO fit parameters, one batch of pixels at a time.

    Returns
    -------
    None
    """
    mem_per_pix = self.n_sho_bins * self.h5_sho_fit.dtype.itemsize + \
        self.n_spec_bins * self.h5_raw.dtype.itemsize
    free_mem = self.max_ram
    batch_size = int(free_mem / mem_per_pix)
    batches = gen_batches(self.n_pixels, batch_size)

    w_vec = self.h5_spec_vals[get_attr(self.h5_spec_vals, 'Frequency')].squeeze()
    w_vec = w_vec[:self.n_bins]

    for pix_batch in batches:
        sho_chunk = self.h5_sho_fit[pix_batch, :].flatten()

        raw_data = np.zeros([sho_chunk.shape[0], self.n_bins], dtype=np.complex64)
        for iparm, sho_parms in enumerate(sho_chunk):
            raw_data[iparm, :] = SHOfunc(sho_parms, w_vec)

        self.h5_raw[pix_batch, :] = raw_data.reshape([-1, self.n_spec_bins])
        self.h5_file.flush()

    return
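# ---------------------------------------------------------------------------
# Hedged sketch, not part of the original module: the memory-bounded batching
# pattern used by _calc_raw() above, shown standalone with sklearn's
# gen_batches. All numbers below are made up.
# ---------------------------------------------------------------------------
def _example_memory_bounded_batches():
    """Minimal sketch of choosing a batch size from a memory budget."""
    import numpy as np
    from sklearn.utils import gen_batches

    n_pixels = 10000
    bytes_per_pixel = 2 * 1024        # assumed in-memory cost of one pixel
    max_ram_bytes = 4 * 1024 ** 2     # pretend only 4 MB are available

    batch_size = int(max_ram_bytes / bytes_per_pixel)
    for batch in gen_batches(n_pixels, batch_size):
        # each batch is a slice object, e.g. slice(0, 2048, None)
        chunk = np.zeros(batch.stop - batch.start)
        # ... fill `chunk` and write it back to the HDF5 dataset here ...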
def test_write_reg_ref_slice_dim_larger_than_data(self): file_path = 'test.h5' self.__delete_existing_file(file_path) with h5py.File(file_path) as h5_f: writer = HDFwriter(h5_f) data = np.random.rand(5, 7) h5_dset = writer._create_simple_dset(h5_f, VirtualDataset('test', data)) self.assertIsInstance(h5_dset, h5py.Dataset) attrs = {'labels': {'even_rows': (slice(0, 15, 2), slice(None)), 'odd_rows': (slice(1, 15, 2), slice(None))}} writer._write_dset_attributes(h5_dset, attrs.copy()) h5_f.flush() # two atts point to region references. one for labels self.assertEqual(len(h5_dset.attrs), 1 + len(attrs['labels'])) # check if the labels attribute was written: self.assertTrue(np.all([x in list(attrs['labels'].keys()) for x in get_attr(h5_dset, 'labels')])) expected_data = [data[:None:2], data[1:None:2]] written_data = [h5_dset[h5_dset.attrs['even_rows']], h5_dset[h5_dset.attrs['odd_rows']]] for exp, act in zip(expected_data, written_data): self.assertTrue(np.allclose(exp, act)) os.remove(file_path)
def test_generate_and_write_reg_ref_legal(self): file_path = 'test.h5' self.__delete_existing_file(file_path) with h5py.File(file_path) as h5_f: writer = HDFwriter(h5_f) data = np.random.rand(2, 7) h5_dset = writer._create_simple_dset(h5_f, VirtualDataset('test', data)) self.assertIsInstance(h5_dset, h5py.Dataset) attrs = {'labels': ['row_1', 'row_2']} if sys.version_info.major == 3: with self.assertWarns(UserWarning): writer._write_dset_attributes(h5_dset, attrs.copy()) else: writer._write_dset_attributes(h5_dset, attrs.copy()) h5_f.flush() # two atts point to region references. one for labels self.assertEqual(len(h5_dset.attrs), 1 + len(attrs['labels'])) # check if the labels attribute was written: self.assertTrue(np.all([x in list(attrs['labels']) for x in get_attr(h5_dset, 'labels')])) expected_data = [data[0], data[1]] written_data = [h5_dset[h5_dset.attrs['row_1']], h5_dset[h5_dset.attrs['row_2']]] for exp, act in zip(expected_data, written_data): self.assertTrue(np.allclose(np.squeeze(exp), np.squeeze(act))) os.remove(file_path)
def test_string_representation(self): usi_dset = self.h5_source h5_main = self.h5_file[usi_dset.name] actual = usi_dset.__repr__() actual = [line.strip() for line in actual.split("\n")] actual = [actual[line_ind] for line_ind in [0, 2, 4, 7, 8, 10, 11]] expected = list() expected.append(h5_main.__repr__()) expected.append(h5_main.name) expected.append(hdf_utils.get_attr(h5_main, "quantity") + " (" + hdf_utils.get_attr(h5_main, "units") + ")") for h5_inds in [usi_dset.h5_pos_inds, usi_dset.h5_spec_inds]: for dim_name, dim_size in zip(hdf_utils.get_attr(h5_inds, "labels"), hdf_utils.get_dimensionality(h5_inds)): expected.append(dim_name + ' - size: ' + str(dim_size)) self.assertTrue(np.all([x == y for x, y in zip(actual, expected)]))
def _check_for_old_guess(self):
    """
    Finds datasets where the same parameters have already been used to compute Guesses for this dataset

    Returns
    -------
    partial_dsets : list
        Datasets with incomplete Guess results from do_guess on this dataset
    completed_dsets : list
        Datasets with completed Guess results from do_guess on this dataset
    """
    groups = check_for_old(self.h5_main, self._fitter_name, new_parms=self._parms_dict,
                           target_dset='Guess', verbose=self._verbose)
    datasets = [grp['Guess'] for grp in groups]

    # Now sort these datasets into partial and complete:
    completed_dsets = []
    partial_dsets = []

    for dset in datasets:
        try:
            last_pix = get_attr(dset, 'last_pixel')
        except KeyError:
            last_pix = None

        # Skip datasets without a last_pixel attribute
        if last_pix is None:
            continue
        elif last_pix < self.h5_main.shape[0]:
            partial_dsets.append(dset)
        else:
            completed_dsets.append(dset)

    return partial_dsets, completed_dsets
def test_write_reg_ref_main_one_dim(self): file_path = 'test.h5' data_utils.delete_existing_file(file_path) data = np.random.rand(7) with h5py.File(file_path, mode='w') as h5_f: h5_dset = h5_f.create_dataset('Main', data=data) reg_refs = { 'even_rows': (slice(0, None, 2)), 'odd_rows': (slice(1, None, 2)) } reg_ref.write_region_references(h5_dset, reg_refs, add_labels_attr=True) self.assertEqual(len(h5_dset.attrs), 1 + len(reg_refs)) actual = get_attr(h5_dset, 'labels') self.assertTrue( np.all([ x == y for x, y in zip(actual, ['even_rows', 'odd_rows']) ])) expected_data = [data[0:None:2], data[1:None:2]] written_data = [ h5_dset[h5_dset.attrs['even_rows']], h5_dset[h5_dset.attrs['odd_rows']] ] for exp, act in zip(expected_data, written_data): self.assertTrue(np.allclose(exp, act)) os.remove(file_path)
def _set_guess(self, h5_guess): """ Setup to run the fit on an existing guess dataset. Sets the attributes normally defined during do_guess. Parameters ---------- h5_guess : h5py.Dataset Dataset object containing the guesses """ ''' Get the Spectroscopic and Position datasets from `self.h5_main` ''' self._sho_spec_inds = self.h5_main.h5_spec_inds self._sho_spec_vals = self.h5_main.h5_spec_vals self._sho_pos_inds = self.h5_main.h5_pos_inds ''' Find the Spectroscopic index for the DC_Offset ''' fit_ind = np.argwhere(get_attr(self._sho_spec_vals, 'labels') == self._fit_dim_name).squeeze() self._fit_spec_index = fit_ind self._fit_offset_index = 1 + fit_ind ''' Get the group and projection datasets ''' self._h5_group = h5_guess.parent self.h5_projected_loops = self._h5_group['Projected_Loops'] self.h5_loop_metrics = self._h5_group['Loop_Metrics'] self._met_spec_inds = self._h5_group['Loop_Metrics_Indices'] self.h5_guess = h5_guess
def test_string_representation(self): with h5py.File(test_h5_file_path, mode='r') as h5_f: h5_main = h5_f['/Raw_Measurement/source_main'] usi_dset = USIDataset(h5_main) actual = usi_dset.__repr__() actual = [line.strip() for line in actual.split("\n")] actual = [actual[line_ind] for line_ind in [0, 2, 4, 7, 8, 10, 11]] expected = list() expected.append(h5_main.__repr__()) expected.append(h5_main.name) expected.append(hdf_utils.get_attr(h5_main, "quantity") + " (" + hdf_utils.get_attr(h5_main, "units") + ")") for h5_inds in [usi_dset.h5_pos_inds, usi_dset.h5_spec_inds]: for dim_name, dim_size in zip(hdf_utils.get_attr(h5_inds, "labels"), hdf_utils.get_dimensionality(h5_inds)): expected.append(dim_name + ' - size: ' + str(dim_size)) self.assertTrue(np.all([x == y for x, y in zip(actual, expected)]))
def test_simple_region_ref_copy(self): # based on test_hdf_writer.test_write_legal_reg_ref_multi_dim_data() file_path = 'test.h5' data_utils.delete_existing_file(file_path) with h5py.File(file_path, mode='w') as h5_f: data = np.random.rand(5, 7) h5_orig_dset = h5_f.create_dataset('test', data=data) self.assertIsInstance(h5_orig_dset, h5py.Dataset) attrs = { 'labels': { 'even_rows': (slice(0, None, 2), slice(None)), 'odd_rows': (slice(1, None, 2), slice(None)) } } data_utils.write_main_reg_refs(h5_orig_dset, attrs['labels']) h5_f.flush() # two atts point to region references. one for labels self.assertEqual(len(h5_orig_dset.attrs), 1 + len(attrs['labels'])) # check if the labels attribute was written: self.assertTrue( np.all([ x in list(attrs['labels'].keys()) for x in get_attr(h5_orig_dset, 'labels') ])) expected_data = [data[:None:2], data[1:None:2]] written_data = [ h5_orig_dset[h5_orig_dset.attrs['even_rows']], h5_orig_dset[h5_orig_dset.attrs['odd_rows']] ] for exp, act in zip(expected_data, written_data): self.assertTrue(np.allclose(exp, act)) # Now write a new dataset without the region reference: h5_new_dset = h5_f.create_dataset('other', data=data) self.assertIsInstance(h5_orig_dset, h5py.Dataset) h5_f.flush() for key in attrs['labels'].keys(): reg_ref.simple_region_ref_copy(h5_orig_dset, h5_new_dset, key) # now check to make sure that this dataset also has the same region references: written_data = [ h5_new_dset[h5_new_dset.attrs['even_rows']], h5_new_dset[h5_new_dset.attrs['odd_rows']] ] for exp, act in zip(expected_data, written_data): self.assertTrue(np.allclose(exp, act)) os.remove(file_path)
def __init__(self, h5_main, **kwargs):
    super(BELoopProjector, self).__init__(h5_main, **kwargs)

    if 'DC_Offset' in self.h5_main.spec_dim_labels:
        self._fit_dim_name = 'DC_Offset'
    elif 'write_bias' in self.h5_main.spec_dim_labels:
        self._fit_dim_name = 'write_bias'
    else:
        raise ValueError('Neither "DC_Offset" nor "write_bias" is a spectroscopic dimension in the provided '
                         'dataset, which has dimensions: {}'.format(self.h5_main.spec_dim_labels))

    if 'FORC' in self.h5_main.spec_dim_labels:
        self._forc_dim_name = 'FORC'
    else:
        self._forc_dim_name = 'FORC_Cycle'

    # TODO: Need to catch KeyErrors that would be thrown when attempting to access attributes
    file_data_type = get_attr(h5_main.file, 'data_type')
    meas_grp_name = h5_main.name.split('/')
    h5_meas_grp = h5_main.file[meas_grp_name[1]]
    meas_data_type = get_attr(h5_meas_grp, 'data_type')

    if h5_main.dtype != sho32:
        raise TypeError('Provided dataset is not a SHO results dataset.')

    # Compare the file- and Measurement-group-level data types, ignoring case
    if meas_data_type.lower() != file_data_type.lower():
        message = 'Mismatch between file and Measurement group data types for the chosen dataset.\n'
        message += 'File data type is {}. The data type for Measurement group {} is {}'.format(
            file_data_type, h5_meas_grp.name, meas_data_type)
        raise ValueError(message)

    if file_data_type == 'BEPSData':
        if get_attr(h5_meas_grp, 'VS_mode') not in ['DC modulation mode', 'current mode']:
            raise ValueError('Provided dataset has VS_mode: "' + get_attr(h5_meas_grp, 'VS_mode') +
                             '", which is not a "DC modulation" or "current mode" BEPS dataset')
        elif get_attr(h5_meas_grp, 'VS_cycle_fraction') != 'full':
            raise ValueError('Provided dataset does not have full cycles')
    elif file_data_type == 'cKPFMData':
        if get_attr(h5_meas_grp, 'VS_mode') != 'cKPFM':
            raise ValueError('Provided dataset has an unsupported VS_mode: "' +
                             get_attr(h5_meas_grp, 'VS_mode') + '"')

    # #####################################################################

    self.process_name = "Loop_Projection"
    self.parms_dict = {'projection_method': 'pycroscopy BE loop model'}
def test_get_indices_for_region_ref_corners(self): with h5py.File(data_utils.std_beps_path, mode='r') as h5_f: h5_main = h5_f['/Raw_Measurement/source_main'] ref_in = get_attr(h5_main, 'even_rows') ret_val = reg_ref.get_indices_for_region_ref( h5_main, ref_in, 'corners') expected_pos = np.repeat(np.arange(h5_main.shape[0])[::2], 2) expected_spec = np.tile(np.array([0, h5_main.shape[1] - 1]), expected_pos.size // 2) expected_corners = np.vstack((expected_pos, expected_spec)).T self.assertTrue(np.allclose(ret_val, expected_corners))
def test_get_indices_for_region_ref_slices(self): with h5py.File(data_utils.std_beps_path, mode='r') as h5_f: h5_main = h5_f['/Raw_Measurement/source_main'] ref_in = get_attr(h5_main, 'even_rows') ret_val = reg_ref.get_indices_for_region_ref( h5_main, ref_in, 'slices') spec_slice = slice(0, h5_main.shape[1] - 1, None) expected_slices = np.array( [[slice(x, x, None), spec_slice] for x in np.arange(h5_main.shape[0])[::2]]) self.assertTrue(np.all(ret_val == expected_slices))
def _write_results_chunk(self): """ Writes the provided SVD results to file Parameters ---------- """ comp_dim = Dimension('Principal Component', 'a. u.', len(self.__s)) h5_svd_group = create_results_group(self.h5_main, self.process_name, h5_parent_group=self._h5_target_group) self.h5_results_grp = h5_svd_group self._write_source_dset_provenance() write_simple_attrs(h5_svd_group, self.parms_dict) write_simple_attrs(h5_svd_group, {'svd_method': 'sklearn-randomized'}) h5_u = write_main_dataset(h5_svd_group, np.float32(self.__u), 'U', 'Abundance', 'a.u.', None, comp_dim, h5_pos_inds=self.h5_main.h5_pos_inds, h5_pos_vals=self.h5_main.h5_pos_vals, dtype=np.float32, chunks=calc_chunks(self.__u.shape, np.float32(0).itemsize)) # print(get_attr(self.h5_main, 'quantity')[0]) h5_v = write_main_dataset(h5_svd_group, self.__v, 'V', get_attr(self.h5_main, 'quantity')[0], 'a.u.', comp_dim, None, h5_spec_inds=self.h5_main.h5_spec_inds, h5_spec_vals=self.h5_main.h5_spec_vals, chunks=calc_chunks(self.__v.shape, self.h5_main.dtype.itemsize)) # No point making this 1D dataset a main dataset h5_s = h5_svd_group.create_dataset('S', data=np.float32(self.__s)) ''' Check h5_main for plot group references. Copy them into V if they exist ''' for key in self.h5_main.attrs.keys(): if '_Plot_Group' not in key: continue ref_inds = get_indices_for_region_ref(self.h5_main, self.h5_main.attrs[key], return_method='corners') ref_inds = ref_inds.reshape([-1, 2, 2]) ref_inds[:, 1, 0] = h5_v.shape[0] - 1 svd_ref = create_region_reference(h5_v, ref_inds) h5_v.attrs[key] = svd_ref # Marking completion: self._status_dset_name = 'completed_positions' self._h5_status_dset = h5_svd_group.create_dataset(self._status_dset_name, data=np.ones(self.h5_main.shape[0], dtype=np.uint8)) # keeping legacy option: h5_svd_group.attrs['last_pixel'] = self.h5_main.shape[0]
def test_sparse_samp_no_attr(self): # What should the user expect this function to do? throw an error. # Without the attribute, this function will have no idea that it is looking at a sparse sampling case # it will return the first and second columns of vals blindly with h5py.File(data_utils.sparse_sampling_path, mode='r') as h5_f: h5_inds = h5_f['/Measurement_000/Channel_000/Position_Indices'] h5_vals = h5_f['/Measurement_000/Channel_000/Position_Values'] dim_names = hdf_utils.get_attr(h5_inds, 'labels') ret_dict = hdf_utils.get_unit_values(h5_inds, h5_vals) for dim_ind, dim_name in enumerate(dim_names): exp_val = h5_vals[:, dim_ind] act_val = ret_dict[dim_name] self.assertTrue(np.allclose(exp_val, act_val))
def test_legal_03(self): attrs = { 'att_1': 'string_val', 'att_2': 1.2345, 'att_3': [1, 2, 3, 4], 'att_4': ['str_1', 'str_2', 'str_3'] } with h5py.File(data_utils.std_beps_path, mode='r') as h5_f: h5_group = h5_f['/Raw_Measurement/source_main-Fitter_000'] for key, expected_value in attrs.items(): self.assertTrue( np.all( hdf_utils.get_attr(h5_group, key) == expected_value))
def test_np_array(self): file_path = 'test.h5' data_utils.delete_existing_file(file_path) with h5py.File(file_path, mode='w') as h5_f: attrs = {'att_1': np.random.rand(4)} hdf_utils.write_simple_attrs(h5_f, attrs) for key, expected_val in attrs.items(): self.assertTrue( np.all(hdf_utils.get_attr(h5_f, key) == expected_val)) os.remove(file_path)
def _write_results_chunk(self): """ Writes the provided SVD results to file Parameters ---------- """ comp_dim = Dimension('Principal Component', 'a. u.', len(self.__s)) h5_svd_group = create_results_group(self.h5_main, self.process_name) self.h5_results_grp = h5_svd_group write_simple_attrs(h5_svd_group, self.parms_dict) write_simple_attrs(h5_svd_group, {'svd_method': 'sklearn-randomized'}) h5_u = write_main_dataset(h5_svd_group, np.float32(self.__u), 'U', 'Abundance', 'a.u.', None, comp_dim, h5_pos_inds=self.h5_main.h5_pos_inds, h5_pos_vals=self.h5_main.h5_pos_vals, dtype=np.float32, chunks=calc_chunks(self.__u.shape, np.float32(0).itemsize)) # print(get_attr(self.h5_main, 'quantity')[0]) h5_v = write_main_dataset(h5_svd_group, self.__v, 'V', get_attr(self.h5_main, 'quantity')[0], 'a.u.', comp_dim, None, h5_spec_inds=self.h5_main.h5_spec_inds, h5_spec_vals=self.h5_main.h5_spec_vals, chunks=calc_chunks(self.__v.shape, self.h5_main.dtype.itemsize)) # No point making this 1D dataset a main dataset h5_s = h5_svd_group.create_dataset('S', data=np.float32(self.__s)) ''' Check h5_main for plot group references. Copy them into V if they exist ''' for key in self.h5_main.attrs.keys(): if '_Plot_Group' not in key: continue ref_inds = get_indices_for_region_ref(self.h5_main, self.h5_main.attrs[key], return_method='corners') ref_inds = ref_inds.reshape([-1, 2, 2]) ref_inds[:, 1, 0] = h5_v.shape[0] - 1 svd_ref = create_region_reference(h5_v, ref_inds) h5_v.attrs[key] = svd_ref # Marking completion: self._status_dset_name = 'completed_positions' self._h5_status_dset = h5_svd_group.create_dataset(self._status_dset_name, data=np.ones(self.h5_main.shape[0], dtype=np.uint8)) # keeping legacy option: h5_svd_group.attrs['last_pixel'] = self.h5_main.shape[0]
def _create_guess_datasets(self): """ Creates the h5 group, guess dataset, corresponding spectroscopic datasets and also links the guess dataset to the spectroscopic datasets. """ self.h5_results_grp = create_results_group( self.h5_main, self.process_name, h5_parent_group=self._h5_target_group) write_simple_attrs(self.h5_results_grp, self.parms_dict) # If writing to a new HDF5 file: # Add back the data_type attribute - still being used in the visualizer if self.h5_results_grp.file != self.h5_main.file: write_simple_attrs( self.h5_results_grp.file, {'data_type': get_attr(self.h5_main.file, 'data_type')}) ret_vals = write_reduced_anc_dsets(self.h5_results_grp, self.h5_main.h5_spec_inds, self.h5_main.h5_spec_vals, self._fit_dim_name, verbose=self.verbose) h5_sho_inds, h5_sho_vals = ret_vals self._h5_guess = write_main_dataset( self.h5_results_grp, (self.h5_main.shape[0], self.num_udvs_steps), 'Guess', 'SHO', 'compound', None, None, h5_pos_inds=self.h5_main.h5_pos_inds, h5_pos_vals=self.h5_main.h5_pos_vals, h5_spec_inds=h5_sho_inds, h5_spec_vals=h5_sho_vals, chunks=(1, self.num_udvs_steps), dtype=sho32, main_dset_attrs=self.parms_dict, verbose=self.verbose) # Does not make sense to propagate region refs - nobody uses them # copy_region_refs(self.h5_main, self._h5_guess) self._h5_guess.file.flush() if self.verbose and self.mpi_rank == 0: print('Finished creating Guess dataset')
def test_write_single_group(self): file_path = 'test.h5' self.__delete_existing_file(file_path) with h5py.File(file_path) as h5_f: attrs = {'att_1': 'string_val', 'att_2': 1.2345, 'att_3': [1, 2, 3, 4], 'att_4': ['str_1', 'str_2', 'str_3']} micro_group = VirtualGroup('Test_') micro_group.attrs = attrs writer = HDFwriter(h5_f) [h5_group] = writer.write(micro_group) for key, expected_val in attrs.items(): self.assertTrue(np.all(get_attr(h5_group, key) == expected_val)) os.remove(file_path)
def _write_results_chunk(self): """ Writes the labels and mean response to the h5 file Returns --------- h5_group : HDF5 Group reference Reference to the group that contains the decomposition results """ h5_decomp_group = create_results_group(self.h5_main, self.process_name, h5_parent_group=self._h5_target_group) self._write_source_dset_provenance() write_simple_attrs(h5_decomp_group, self.parms_dict) write_simple_attrs(h5_decomp_group, {'n_components': self.__components.shape[0], 'n_samples': self.h5_main.shape[0]}) decomp_desc = Dimension('Endmember', 'a. u.', self.__components.shape[0]) # equivalent to V - compound / complex h5_components = write_main_dataset(h5_decomp_group, self.__components, 'Components', get_attr(self.h5_main, 'quantity')[0], 'a.u.', decomp_desc, None, h5_spec_inds=self.h5_main.h5_spec_inds, h5_spec_vals=self.h5_main.h5_spec_vals) # equivalent of U - real h5_projections = write_main_dataset(h5_decomp_group, np.float32(self.__projection), 'Projection', 'abundance', 'a.u.', None, decomp_desc, dtype=np.float32, h5_pos_inds=self.h5_main.h5_pos_inds, h5_pos_vals=self.h5_main.h5_pos_vals) # return the h5 group object self.h5_results_grp = h5_decomp_group # Marking completion: self._status_dset_name = 'completed_positions' self._h5_status_dset = h5_decomp_group.create_dataset(self._status_dset_name, data=np.ones(self.h5_main.shape[0], dtype=np.uint8)) # keeping legacy option: h5_decomp_group.attrs['last_pixel'] = self.h5_main.shape[0] return self.h5_results_grp
def test_write_legal_reg_ref_multi_dim_data_2nd_dim(self): file_path = 'test.h5' self.__delete_existing_file(file_path) with h5py.File(file_path) as h5_f: writer = HDFwriter(h5_f) data = np.random.rand(5, 3) h5_dset = writer._create_simple_dset(h5_f, VirtualDataset('test', data)) self.assertIsInstance(h5_dset, h5py.Dataset) attrs = { 'labels': { 'even_rows': (slice(None), slice(0, None, 2)), 'odd_rows': (slice(None), slice(1, None, 2)) } } writer._write_dset_attributes(h5_dset, attrs.copy()) h5_f.flush() # two atts point to region references. one for labels self.assertEqual(len(h5_dset.attrs), 1 + len(attrs['labels'])) # check if the labels attribute was written: self.assertTrue( np.all([ x in list(attrs['labels'].keys()) for x in get_attr(h5_dset, 'labels') ])) expected_data = [data[:, 0:None:2], data[:, 1:None:2]] written_data = [ h5_dset[h5_dset.attrs['even_rows']], h5_dset[h5_dset.attrs['odd_rows']] ] for exp, act in zip(expected_data, written_data): self.assertTrue(np.allclose(exp, act)) os.remove(file_path)
def test_to_grp(self): file_path = 'test.h5' data_utils.delete_existing_file(file_path) with h5py.File(file_path, mode='w') as h5_f: h5_group = h5_f.create_group('Blah') attrs = { 'att_1': 'string_val', 'att_2': 1.234, 'att_3': [1, 2, 3.14, 4], 'att_4': ['s', 'tr', 'str_3'] } hdf_utils.write_simple_attrs(h5_group, attrs) for key, expected_val in attrs.items(): self.assertTrue( np.all(hdf_utils.get_attr(h5_group, key) == expected_val)) os.remove(file_path)
def _write_results_chunk(self): """ Writes the labels and mean response to the h5 file Returns --------- h5_group : HDF5 Group reference Reference to the group that contains the decomposition results """ h5_decomp_group = create_results_group(self.h5_main, self.process_name) write_simple_attrs(h5_decomp_group, self.parms_dict) write_simple_attrs(h5_decomp_group, {'n_components': self.__components.shape[0], 'n_samples': self.h5_main.shape[0]}) decomp_desc = Dimension('Endmember', 'a. u.', self.__components.shape[0]) # equivalent to V - compound / complex h5_components = write_main_dataset(h5_decomp_group, self.__components, 'Components', get_attr(self.h5_main, 'quantity')[0], 'a.u.', decomp_desc, None, h5_spec_inds=self.h5_main.h5_spec_inds, h5_spec_vals=self.h5_main.h5_spec_vals) # equivalent of U - real h5_projections = write_main_dataset(h5_decomp_group, np.float32(self.__projection), 'Projection', 'abundance', 'a.u.', None, decomp_desc, dtype=np.float32, h5_pos_inds=self.h5_main.h5_pos_inds, h5_pos_vals=self.h5_main.h5_pos_vals) # return the h5 group object self.h5_results_grp = h5_decomp_group # Marking completion: self._status_dset_name = 'completed_positions' self._h5_status_dset = h5_decomp_group.create_dataset(self._status_dset_name, data=np.ones(self.h5_main.shape[0], dtype=np.uint8)) # keeping legacy option: h5_decomp_group.attrs['last_pixel'] = self.h5_main.shape[0] return self.h5_results_grp
def basic_file_validation(self, h5_f): self.assertEqual('ImageTranslator', hdf_utils.get_attr(h5_f, 'translator')) # First level should have absolutely nothing besides one group self.assertEqual(len(h5_f.items()), 1) self.assertTrue('Measurement_000' in h5_f.keys()) h5_meas_grp = h5_f['Measurement_000'] self.assertIsInstance(h5_meas_grp, h5py.Group) # Again, this group should only have one group - Channel_000 self.assertEqual(len(h5_meas_grp.items()), 1) self.assertTrue('Channel_000' in h5_meas_grp.keys()) h5_chan_grp = h5_meas_grp['Channel_000'] self.assertIsInstance(h5_chan_grp, h5py.Group) # This channel group is not expected to have any (custom) attributes but it will contain the main dataset self.assertEqual(len(h5_chan_grp.items()), 5) for dset_name in [ 'Raw_Data', 'Position_Indices', 'Position_Values', 'Spectroscopic_Indices', 'Spectroscopic_Values' ]: self.assertTrue(dset_name in h5_chan_grp.keys()) h5_dset = h5_chan_grp[dset_name] self.assertIsInstance(h5_dset, h5py.Dataset) usid_main = USIDataset(h5_chan_grp['Raw_Data']) self.assertIsInstance(usid_main, USIDataset) self.assertEqual(usid_main.name.split('/')[-1], 'Raw_Data') self.assertEqual(usid_main.parent, h5_chan_grp) validate_aux_dset_pair(self, h5_chan_grp, usid_main.h5_spec_inds, usid_main.h5_spec_vals, ['arb'], ['a.u.'], np.atleast_2d([0]), h5_main=usid_main, is_spectral=True)
def test_to_dset(self): file_path = 'test.h5' data_utils.delete_existing_file(file_path) with h5py.File(file_path, mode='w') as h5_f: h5_dset = h5_f.create_dataset('Test', data=np.arange(3)) attrs = { 'att_1': 'string_val', 'att_2': 1.2345, 'att_3': [1, 2, 3, 4], 'att_4': ['str_1', 'str_2', 'str_3'] } hdf_utils.write_simple_attrs(h5_dset, attrs) self.assertEqual(len(h5_dset.attrs), len(attrs)) for key, expected_val in attrs.items(): self.assertTrue( np.all(hdf_utils.get_attr(h5_dset, key) == expected_val)) os.remove(file_path)
def translate(self, h5_path, force_patch=False, **kwargs):
    """
    Add the needed references and attributes to the h5 file that are not
    created by the LabView data acquisition program.

    Parameters
    ----------
    h5_path : str
        path to the h5 file
    force_patch : bool, optional
        Should the check to see if the file has already been patched be ignored.
        Default False.

    Returns
    -------
    h5_file : h5py.File
        patched hdf5 file
    """
    # Open the file and check if a patch is needed
    h5_file = h5py.File(os.path.abspath(h5_path), 'r+')
    if h5_file.attrs.get('translator') is not None and not force_patch:
        print('File is already Pycroscopy ready.')
        return h5_file

    '''
    Get the list of all Raw_Data Datasets
    Loop over the list and update the needed attributes
    '''
    raw_list = find_dataset(h5_file, 'Raw_Data')
    for h5_raw in raw_list:
        if 'quantity' not in h5_raw.attrs:
            h5_raw.attrs['quantity'] = 'quantity'
        if 'units' not in h5_raw.attrs:
            h5_raw.attrs['units'] = 'a.u.'

        # Grab the channel and measurement group of the data to check some needed attributes
        h5_chan = h5_raw.parent
        try:
            c_type = get_attr(h5_chan, 'channel_type')
        except KeyError:
            warn_str = "'channel_type' was not found as an attribute of {}.\n".format(h5_chan.name)
            warn_str += "If this is BEPS or BELine data from the LabView acquisition software, " + \
                        "please run the following piece of code. Afterwards, run this function again.\n" + \
                        "CODE: " \
                        "hdf.file['{}'].attrs['channel_type'] = 'BE'".format(h5_chan.name)
            warn(warn_str)
            return h5_file
        except:
            raise

        if c_type != 'BE':
            continue

        h5_meas = h5_chan.parent
        h5_meas.attrs['num_UDVS_steps'] = h5_meas.attrs['num_steps']

        # Get the object handles for the Indices and Values datasets
        h5_pos_inds = h5_chan['Position_Indices']
        h5_pos_vals = h5_chan['Position_Values']
        h5_spec_inds = h5_chan['Spectroscopic_Indices']
        h5_spec_vals = h5_chan['Spectroscopic_Values']

        # Make sure we have correct spectroscopic indices for the given values
        ds_spec_inds = create_spec_inds_from_vals(h5_spec_vals[()])
        if not np.allclose(ds_spec_inds, h5_spec_inds[()]):
            h5_spec_inds[:, :] = ds_spec_inds[:, :]
            h5_file.flush()

        # Get the labels and units for the Spectroscopic datasets
        h5_spec_labels = h5_spec_inds.attrs['labels']
        inds_and_vals = [h5_pos_inds, h5_pos_vals, h5_spec_inds, h5_spec_vals]
        for dset in inds_and_vals:
            spec_labels = dset.attrs['labels']
            try:
                spec_units = dset.attrs['units']
                if len(spec_units) != len(spec_labels):
                    raise KeyError
            except KeyError:
                dset.attrs['units'] = ['' for _ in spec_labels]
            except:
                raise

        for ilabel, label in enumerate(h5_spec_labels):
            label_slice = (slice(ilabel, ilabel + 1), slice(None))
            if label == '':
                label = 'Step'
            h5_spec_inds.attrs[label] = h5_spec_inds.regionref[label_slice]
            h5_spec_vals.attrs[label] = h5_spec_vals.regionref[label_slice]

        # Link the references to the Indices and Values datasets to the Raw_Data
        link_as_main(h5_raw, h5_pos_inds, h5_pos_vals, h5_spec_inds, h5_spec_vals)

        # Also link the Bin_Frequencies and Bin_Wfm_Type datasets
        h5_freqs = h5_chan['Bin_Frequencies']
        aux_dset_names = ['Bin_Frequencies']
        aux_dset_refs = [h5_freqs.ref]
        check_and_link_ancillary(h5_raw, aux_dset_names, anc_refs=aux_dset_refs)

        '''
        Get all SHO_Fit groups for the Raw_Data and loop over them
        Get the Guess and Spectroscopic Datasets for each SHO_Fit group
        '''
        sho_list = find_results_groups(h5_raw, 'SHO_Fit')
        for h5_sho in sho_list:
            h5_sho_guess = h5_sho['Guess']
            h5_sho_spec_inds = h5_sho['Spectroscopic_Indices']
            h5_sho_spec_vals = h5_sho['Spectroscopic_Values']

            # Make sure we have correct spectroscopic indices for the given values
            ds_sho_spec_inds = create_spec_inds_from_vals(h5_sho_spec_inds[()])
            if not np.allclose(ds_sho_spec_inds, h5_sho_spec_inds[()]):
                h5_sho_spec_inds[:, :] = ds_sho_spec_inds[:, :]

            # Get the labels and units for the Spectroscopic datasets
            h5_sho_spec_labels = get_attr(h5_sho_spec_inds, 'labels')
            link_as_main(h5_sho_guess, h5_pos_inds, h5_pos_vals, h5_sho_spec_inds, h5_sho_spec_vals)
            sho_inds_and_vals = [h5_sho_spec_inds, h5_sho_spec_vals]

            for dset in sho_inds_and_vals:
                spec_labels = get_attr(dset, 'labels')
                try:
                    spec_units = get_attr(dset, 'units')
                    if len(spec_units) != len(spec_labels):
                        raise KeyError
                except KeyError:
                    spec_units = [''.encode('utf-8') for _ in spec_labels]
                    dset.attrs['units'] = spec_units
                except:
                    raise

            # Make region references in the SHO spectroscopic indices and values datasets
            for ilabel, label in enumerate(h5_sho_spec_labels):
                label_slice = (slice(ilabel, ilabel + 1), slice(None))
                if label == '':
                    label = 'Step'.encode('utf-8')
                h5_sho_spec_inds.attrs[label] = h5_sho_spec_inds.regionref[label_slice]
                h5_sho_spec_vals.attrs[label] = h5_sho_spec_vals.regionref[label_slice]

    h5_file.flush()
    h5_file.attrs['translator'] = 'V3patcher'.encode('utf-8')

    return h5_file
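# ---------------------------------------------------------------------------
# Hedged usage sketch, not part of the original module: how translate() above
# might be invoked. `LabViewH5Patcher` is used here only as a placeholder name
# for whichever class owns the method; the file path is made up.
# ---------------------------------------------------------------------------
def _example_patch_labview_file():
    """Minimal sketch of patching an existing LabView-produced HDF5 file."""
    patcher = LabViewH5Patcher()                         # hypothetical class name
    h5_file = patcher.translate('old_labview_file.h5')   # made-up path
    print('Patched file:', h5_file.filename)
    h5_file.close()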
def _write_results_chunk(self): """ Writes the labels and mean response to the h5 file Returns --------- h5_group : HDF5 Group reference Reference to the group that contains the clustering results """ print('Writing clustering results to file.') num_clusters = self.__mean_resp.shape[0] h5_cluster_group = create_results_group(self.h5_main, self.process_name) write_simple_attrs(h5_cluster_group, self.parms_dict) h5_labels = write_main_dataset(h5_cluster_group, np.uint32(self.__labels.reshape([-1, 1])), 'Labels', 'Cluster ID', 'a. u.', None, Dimension('Cluster', 'ID', 1), h5_pos_inds=self.h5_main.h5_pos_inds, h5_pos_vals=self.h5_main.h5_pos_vals, aux_spec_prefix='Cluster_', dtype=np.uint32) if self.num_comps != self.h5_main.shape[1]: ''' Setup the Spectroscopic Indices and Values for the Mean Response if we didn't use all components Note that a sliced spectroscopic matrix may not be contiguous. Let's just lose the spectroscopic data for now until a better method is figured out ''' """ if isinstance(self.data_slice[1], np.ndarray): centroid_vals_mat = h5_centroids.h5_spec_vals[self.data_slice[1].tolist()] else: centroid_vals_mat = h5_centroids.h5_spec_vals[self.data_slice[1]] ds_centroid_values.data[0, :] = centroid_vals_mat """ if isinstance(self.data_slice[1], np.ndarray): vals_slice = self.data_slice[1].tolist() else: vals_slice = self.data_slice[1] vals = self.h5_main.h5_spec_vals[:, vals_slice].squeeze() new_spec = Dimension('Original_Spectral_Index', 'a.u.', vals) h5_inds, h5_vals = write_ind_val_dsets(h5_cluster_group, new_spec, is_spectral=True) else: h5_inds = self.h5_main.h5_spec_inds h5_vals = self.h5_main.h5_spec_vals # For now, link centroids with default spectroscopic indices and values. h5_centroids = write_main_dataset(h5_cluster_group, self.__mean_resp, 'Mean_Response', get_attr(self.h5_main, 'quantity')[0], get_attr(self.h5_main, 'units')[0], Dimension('Cluster', 'a. u.', np.arange(num_clusters)), None, h5_spec_inds=h5_inds, aux_pos_prefix='Mean_Resp_Pos_', h5_spec_vals=h5_vals) # Marking completion: self._status_dset_name = 'completed_positions' self._h5_status_dset = h5_cluster_group.create_dataset(self._status_dset_name, data=np.ones(self.h5_main.shape[0], dtype=np.uint8)) # keeping legacy option: h5_cluster_group.attrs['last_pixel'] = self.h5_main.shape[0] return h5_cluster_group
def rebuild_svd(h5_main, components=None, cores=None, max_RAM_mb=1024):
    """
    Rebuild the Image from the SVD results on the windows.
    Optionally, only use components less than n_comp.

    Parameters
    ----------
    h5_main : hdf5 Dataset
        dataset which SVD was performed on
    components : {int, iterable of int, slice} optional
        Defines which components to keep
        Default - None, all components kept

        Input Types
        integer : Components less than the input will be kept
        length 2 iterable of integers : Integers define start and stop of component slice to retain
        other iterable of integers or slice : Selection of component indices to retain
    cores : int, optional
        How many cores should be used to rebuild
        Default - None, all but 2 cores will be used, min 1
    max_RAM_mb : int, optional
        Maximum amount of memory to use when rebuilding, in Mb.
        Default - 1024Mb

    Returns
    -------
    rebuilt_data : HDF5 Dataset
        the rebuilt dataset
    """
    comp_slice, num_comps = get_component_slice(components, total_components=h5_main.shape[1])
    if isinstance(comp_slice, np.ndarray):
        comp_slice = list(comp_slice)
    dset_name = h5_main.name.split('/')[-1]

    # Ensuring that at least one core is available for use / 2 cores are available for other use
    max_cores = max(1, cpu_count() - 2)
    # print('max_cores',max_cores)
    if cores is not None:
        cores = min(round(abs(cores)), max_cores)
    else:
        cores = max_cores

    max_memory = min(max_RAM_mb * 1024 ** 2, 0.75 * get_available_memory())
    if cores != 1:
        max_memory = int(max_memory / 2)

    '''
    Get the handles for the SVD results
    '''
    try:
        h5_svd_group = find_results_groups(h5_main, 'SVD')[-1]
        h5_S = h5_svd_group['S']
        h5_U = h5_svd_group['U']
        h5_V = h5_svd_group['V']
    except KeyError:
        raise KeyError('SVD Results for {dset} were not found.'.format(dset=dset_name))
    except:
        raise

    func, is_complex, is_compound, n_features, type_mult = check_dtype(h5_V)

    '''
    Calculate the size of a single batch that will fit in the available memory
    '''
    n_comps = h5_S[comp_slice].size
    mem_per_pix = (h5_U.dtype.itemsize + h5_V.dtype.itemsize * h5_V.shape[1]) * n_comps
    fixed_mem = h5_main.size * h5_main.dtype.itemsize

    if cores is None:
        free_mem = max_memory - fixed_mem
    else:
        free_mem = max_memory * 2 - fixed_mem

    batch_size = int(round(float(free_mem) / mem_per_pix))
    batch_slices = gen_batches(h5_U.shape[0], batch_size)

    print('Reconstructing in batches of {} positions.'.format(batch_size))
    print('Batches should be {} Mb each.'.format(mem_per_pix * batch_size / 1024.0 ** 2))

    '''
    Loop over all batches.
    '''
    ds_V = np.dot(np.diag(h5_S[comp_slice]), func(h5_V[comp_slice, :]))
    rebuild = np.zeros((h5_main.shape[0], ds_V.shape[1]))
    for ibatch, batch in enumerate(batch_slices):
        rebuild[batch, :] += np.dot(h5_U[batch, comp_slice], ds_V)

    rebuild = stack_real_to_target_dtype(rebuild, h5_V.dtype)

    print('Completed reconstruction of data from SVD results. Writing to file.')

    '''
    Create the Group and dataset to hold the rebuilt data
    '''
    rebuilt_grp = create_indexed_group(h5_svd_group, 'Rebuilt_Data')
    h5_rebuilt = write_main_dataset(rebuilt_grp, rebuild, 'Rebuilt_Data',
                                    get_attr(h5_main, 'quantity'),
                                    get_attr(h5_main, 'units'),
                                    None, None,
                                    h5_pos_inds=h5_main.h5_pos_inds,
                                    h5_pos_vals=h5_main.h5_pos_vals,
                                    h5_spec_inds=h5_main.h5_spec_inds,
                                    h5_spec_vals=h5_main.h5_spec_vals,
                                    chunks=h5_main.chunks,
                                    compression=h5_main.compression)

    if isinstance(comp_slice, slice):
        rebuilt_grp.attrs['components_used'] = '{}-{}'.format(comp_slice.start, comp_slice.stop)
    else:
        rebuilt_grp.attrs['components_used'] = components

    copy_attributes(h5_main, h5_rebuilt, skip_refs=False)

    h5_main.file.flush()

    print('Done writing reconstructed data to file.')

    return h5_rebuilt
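# ---------------------------------------------------------------------------
# Hedged usage sketch, not part of the original module: rebuilding a dataset
# from a subset of its SVD components with rebuild_svd() above. The file path,
# HDF5 path and component count are made up.
# ---------------------------------------------------------------------------
def _example_rebuild_svd():
    """Minimal sketch of calling rebuild_svd()."""
    import h5py
    from pyUSID import USIDataset  # import path may vary between pyUSID versions

    with h5py.File('measurement_with_svd.h5', mode='r+') as h5_f:
        h5_main = USIDataset(h5_f['Measurement_000/Channel_000/Raw_Data'])
        # keep only the first 16 components (an arbitrary choice)
        h5_rebuilt = rebuild_svd(h5_main, components=16)
        print('Rebuilt data written to:', h5_rebuilt.name)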
def test_write_simple_tree(self):
    file_path = 'test.h5'
    self.__delete_existing_file(file_path)
    with h5py.File(file_path) as h5_f:

        inner_dset_data = np.random.rand(5, 7)
        inner_dset_attrs = {'att_1': 'string_val',
                            'att_2': 1.2345,
                            'att_3': [1, 2, 3, 4],
                            'att_4': ['str_1', 'str_2', 'str_3'],
                            'labels': {'even_rows': (slice(0, None, 2), slice(None)),
                                       'odd_rows': (slice(1, None, 2), slice(None))}
                            }
        inner_dset = VirtualDataset('inner_dset', inner_dset_data)
        inner_dset.attrs = inner_dset_attrs.copy()

        attrs_inner_grp = {'att_1': 'string_val',
                           'att_2': 1.2345,
                           'att_3': [1, 2, 3, 4],
                           'att_4': ['str_1', 'str_2', 'str_3']}
        inner_group = VirtualGroup('indexed_inner_group_')
        inner_group.attrs = attrs_inner_grp
        inner_group.add_children(inner_dset)

        outer_dset_data = np.random.rand(5, 7)
        outer_dset_attrs = {'att_1': 'string_val',
                            'att_2': 1.2345,
                            'att_3': [1, 2, 3, 4],
                            'att_4': ['str_1', 'str_2', 'str_3'],
                            'labels': {'even_rows': (slice(0, None, 2), slice(None)),
                                       'odd_rows': (slice(1, None, 2), slice(None))}
                            }
        outer_dset = VirtualDataset('test', outer_dset_data, parent='/test_group')
        outer_dset.attrs = outer_dset_attrs.copy()

        attrs_outer_grp = {'att_1': 'string_val',
                           'att_2': 1.2345,
                           'att_3': [1, 2, 3, 4],
                           'att_4': ['str_1', 'str_2', 'str_3']}
        outer_group = VirtualGroup('unindexed_outer_group')
        outer_group.attrs = attrs_outer_grp
        outer_group.add_children([inner_group, outer_dset])

        writer = HDFwriter(h5_f)
        h5_refs_list = writer.write(outer_group)

        # I don't know of a more elegant way to do this:
        [h5_outer_dset] = get_h5_obj_refs([outer_dset.name], h5_refs_list)
        [h5_inner_dset] = get_h5_obj_refs([inner_dset.name], h5_refs_list)
        [h5_outer_group] = get_h5_obj_refs([outer_group.name], h5_refs_list)
        [h5_inner_group] = get_h5_obj_refs(['indexed_inner_group_000'], h5_refs_list)

        self.assertIsInstance(h5_outer_dset, h5py.Dataset)
        self.assertIsInstance(h5_inner_dset, h5py.Dataset)
        self.assertIsInstance(h5_outer_group, h5py.Group)
        self.assertIsInstance(h5_inner_group, h5py.Group)

        # check assertions for the inner dataset first
        self.assertEqual(h5_inner_dset.parent, h5_inner_group)

        reg_ref = inner_dset_attrs.pop('labels')

        self.assertEqual(len(h5_inner_dset.attrs), len(inner_dset_attrs) + 1 + len(reg_ref))

        for key, expected_val in inner_dset_attrs.items():
            self.assertTrue(np.all(get_attr(h5_inner_dset, key) == expected_val))

        self.assertTrue(np.all([x in list(reg_ref.keys()) for x in get_attr(h5_inner_dset, 'labels')]))

        expected_data = [inner_dset_data[:None:2], inner_dset_data[1:None:2]]
        written_data = [h5_inner_dset[h5_inner_dset.attrs['even_rows']],
                        h5_inner_dset[h5_inner_dset.attrs['odd_rows']]]

        for exp, act in zip(expected_data, written_data):
            self.assertTrue(np.allclose(exp, act))

        # check assertions for the inner data group next:
        self.assertEqual(h5_inner_group.parent, h5_outer_group)
        for key, expected_val in attrs_inner_grp.items():
            self.assertTrue(np.all(get_attr(h5_inner_group, key) == expected_val))

        # check the outer dataset next:
        self.assertEqual(h5_outer_dset.parent, h5_outer_group)

        reg_ref = outer_dset_attrs.pop('labels')

        self.assertEqual(len(h5_outer_dset.attrs), len(outer_dset_attrs) + 1 + len(reg_ref))

        for key, expected_val in outer_dset_attrs.items():
            self.assertTrue(np.all(get_attr(h5_outer_dset, key) == expected_val))

        self.assertTrue(np.all([x in list(reg_ref.keys()) for x in get_attr(h5_outer_dset, 'labels')]))

        expected_data = [outer_dset_data[:None:2], outer_dset_data[1:None:2]]
        written_data = [h5_outer_dset[h5_outer_dset.attrs['even_rows']],
                        h5_outer_dset[h5_outer_dset.attrs['odd_rows']]]

        for exp, act in zip(expected_data, written_data):
            self.assertTrue(np.allclose(exp, act))

        # Finally check the outer group:
        self.assertEqual(h5_outer_group.parent, h5_f)
        for key, expected_val in attrs_outer_grp.items():
            self.assertTrue(np.all(get_attr(h5_outer_group, key) == expected_val))

    os.remove(file_path)
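# Illustrative note (not executed): the 'labels' entries in the test above are written as
# HDF5 region references. 'even_rows' points at rows 0, 2, 4, ... and 'odd_rows' at
# rows 1, 3, 5, ... of the 5 x 7 array, so the slice pairs are equivalent to plain strides:
#
#   even = inner_dset_data[slice(0, None, 2), slice(None)]   # same as inner_dset_data[0::2]
#   odd  = inner_dset_data[slice(1, None, 2), slice(None)]   # same as inner_dset_data[1::2]
#
# which is exactly what the test compares against the data read back through the references.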
def _setup_h5(self, data_gen_parms):
    """
    Sets up the hdf5 file structure before doing the actual generation

    Parameters
    ----------
    data_gen_parms : dict
        Dictionary containing the parameters to write to the Measurement Group as attributes

    Returns
    -------

    """
    '''
    Build the group structure down to the channel group
    '''
    # Set up the basic group structure
    root_grp = VirtualGroup('')
    root_parms = generate_dummy_main_parms()
    root_parms['translator'] = 'FAKEBEPS'
    root_parms['data_type'] = data_gen_parms['data_type']
    root_grp.attrs = root_parms

    meas_grp = VirtualGroup('Measurement_')
    chan_grp = VirtualGroup('Channel_')

    meas_grp.attrs.update(data_gen_parms)

    # Create the Position and Spectroscopic datasets for the Raw Data
    ds_pos_inds, ds_pos_vals, ds_spec_inds, ds_spec_vals = self._build_ancillary_datasets()

    raw_chunking = calc_chunks([self.n_pixels, self.n_spec_bins],
                               np.complex64(0).itemsize,
                               unit_chunks=[1, self.n_bins])
    ds_raw_data = VirtualDataset('Raw_Data', data=None,
                                 maxshape=[self.n_pixels, self.n_spec_bins],
                                 dtype=np.complex64,
                                 compression='gzip',
                                 chunking=raw_chunking,
                                 parent=meas_grp)

    chan_grp.add_children([ds_pos_inds, ds_pos_vals, ds_spec_inds, ds_spec_vals, ds_raw_data])
    meas_grp.add_children([chan_grp])
    root_grp.add_children([meas_grp])

    hdf = HDFwriter(self.h5_path)
    hdf.delete()
    h5_refs = hdf.write(root_grp)

    # Delete the MicroDatasets to save memory
    del ds_raw_data, ds_spec_inds, ds_spec_vals, ds_pos_inds, ds_pos_vals

    # Get the file and Raw_Data objects
    h5_raw = get_h5_obj_refs(['Raw_Data'], h5_refs)[0]
    h5_chan_grp = h5_raw.parent

    # Get the Position and Spectroscopic dataset objects
    h5_pos_inds = get_h5_obj_refs(['Position_Indices'], h5_refs)[0]
    h5_pos_vals = get_h5_obj_refs(['Position_Values'], h5_refs)[0]
    h5_spec_inds = get_h5_obj_refs(['Spectroscopic_Indices'], h5_refs)[0]
    h5_spec_vals = get_h5_obj_refs(['Spectroscopic_Values'], h5_refs)[0]

    # Link the Position and Spectroscopic datasets as attributes of Raw_Data
    link_as_main(h5_raw, h5_pos_inds, h5_pos_vals, h5_spec_inds, h5_spec_vals)

    '''
    Build the SHO Group
    '''
    sho_grp = VirtualGroup('Raw_Data-SHO_Fit_', parent=h5_chan_grp.name)

    # Build the Spectroscopic datasets for the SHO Guess and Fit
    sho_spec_starts = np.where(h5_spec_inds[h5_spec_inds.attrs['Frequency']].squeeze() == 0)[0]
    sho_spec_labs = get_attr(h5_spec_inds, 'labels')
    ds_sho_spec_inds, ds_sho_spec_vals = build_reduced_spec_dsets(h5_spec_inds,
                                                                  h5_spec_vals,
                                                                  keep_dim=sho_spec_labs != 'Frequency',
                                                                  step_starts=sho_spec_starts)

    sho_chunking = calc_chunks([self.n_pixels, self.n_sho_bins],
                               sho32.itemsize,
                               unit_chunks=[1, 1])
    ds_sho_fit = VirtualDataset('Fit', data=None,
                                maxshape=[self.n_pixels, self.n_sho_bins],
                                dtype=sho32,
                                compression='gzip',
                                chunking=sho_chunking,
                                parent=sho_grp)
    ds_sho_guess = VirtualDataset('Guess', data=None,
                                  maxshape=[self.n_pixels, self.n_sho_bins],
                                  dtype=sho32,
                                  compression='gzip',
                                  chunking=sho_chunking,
                                  parent=sho_grp)

    sho_grp.add_children([ds_sho_fit, ds_sho_guess, ds_sho_spec_inds, ds_sho_spec_vals])

    # Write the SHO group and datasets to the file and delete the MicroDataset objects
    h5_sho_refs = hdf.write(sho_grp)
    del ds_sho_fit, ds_sho_guess, ds_sho_spec_inds, ds_sho_spec_vals

    # Get the dataset handles for the fit and guess
    h5_sho_fit = get_h5_obj_refs(['Fit'], h5_sho_refs)[0]
    h5_sho_guess = get_h5_obj_refs(['Guess'], h5_sho_refs)[0]

    # Get the dataset handles for the SHO Spectroscopic datasets
    h5_sho_spec_inds = get_h5_obj_refs(['Spectroscopic_Indices'], h5_sho_refs)[0]
    h5_sho_spec_vals = get_h5_obj_refs(['Spectroscopic_Values'], h5_sho_refs)[0]

    # Link the Position and Spectroscopic datasets as attributes of the SHO Fit and Guess
    link_as_main(h5_sho_fit, h5_pos_inds, h5_pos_vals, h5_sho_spec_inds, h5_sho_spec_vals)
    link_as_main(h5_sho_guess, h5_pos_inds, h5_pos_vals, h5_sho_spec_inds, h5_sho_spec_vals)

    '''
    Build the loop group
    '''
    loop_grp = VirtualGroup('Fit-Loop_Fit_', parent=h5_sho_fit.parent.name)

    # Build the Spectroscopic datasets for the loops
    loop_spec_starts = np.where(h5_sho_spec_inds[h5_sho_spec_inds.attrs['DC_Offset']].squeeze() == 0)[0]
    loop_spec_labs = get_attr(h5_sho_spec_inds, 'labels')
    ds_loop_spec_inds, ds_loop_spec_vals = build_reduced_spec_dsets(h5_sho_spec_inds,
                                                                    h5_sho_spec_vals,
                                                                    keep_dim=loop_spec_labs != 'DC_Offset',
                                                                    step_starts=loop_spec_starts)

    # Create the loop fit and guess MicroDatasets
    loop_chunking = calc_chunks([self.n_pixels, self.n_loops],
                                loop_fit32.itemsize,
                                unit_chunks=[1, 1])
    ds_loop_fit = VirtualDataset('Fit', data=None,
                                 maxshape=[self.n_pixels, self.n_loops],
                                 dtype=loop_fit32,
                                 compression='gzip',
                                 chunking=loop_chunking,
                                 parent=loop_grp)
    ds_loop_guess = VirtualDataset('Guess', data=None,
                                   maxshape=[self.n_pixels, self.n_loops],
                                   dtype=loop_fit32,
                                   compression='gzip',
                                   chunking=loop_chunking,
                                   parent=loop_grp)

    # Add the datasets to the loop group then write it to the file
    loop_grp.add_children([ds_loop_fit, ds_loop_guess, ds_loop_spec_inds, ds_loop_spec_vals])
    h5_loop_refs = hdf.write(loop_grp)

    # Delete the MicroDatasets
    del ds_loop_spec_vals, ds_loop_spec_inds, ds_loop_guess, ds_loop_fit

    # Get the handles to the datasets
    h5_loop_fit = get_h5_obj_refs(['Fit'], h5_loop_refs)[0]
    h5_loop_guess = get_h5_obj_refs(['Guess'], h5_loop_refs)[0]
    h5_loop_spec_inds = get_h5_obj_refs(['Spectroscopic_Indices'], h5_loop_refs)[0]
    h5_loop_spec_vals = get_h5_obj_refs(['Spectroscopic_Values'], h5_loop_refs)[0]

    # Link the Position and Spectroscopic datasets to the Loop Guess and Fit
    link_as_main(h5_loop_fit, h5_pos_inds, h5_pos_vals, h5_loop_spec_inds, h5_loop_spec_vals)
    link_as_main(h5_loop_guess, h5_pos_inds, h5_pos_vals, h5_loop_spec_inds, h5_loop_spec_vals)

    self.h5_raw = USIDataset(h5_raw)
    self.h5_sho_guess = USIDataset(h5_sho_guess)
    self.h5_sho_fit = USIDataset(h5_sho_fit)
    self.h5_loop_guess = USIDataset(h5_loop_guess)
    self.h5_loop_fit = USIDataset(h5_loop_fit)
    self.h5_spec_vals = h5_spec_vals
    self.h5_spec_inds = h5_spec_inds
    self.h5_sho_spec_inds = h5_sho_spec_inds
    self.h5_sho_spec_vals = h5_sho_spec_vals
    self.h5_loop_spec_inds = h5_loop_spec_inds
    self.h5_loop_spec_vals = h5_loop_spec_vals
    self.h5_file = h5_raw.file

    return
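# For orientation, the tree that _setup_h5 builds looks roughly like the sketch below.
# The numeric suffixes assume the default zero-based auto-indexing of the 'Measurement_',
# 'Channel_' and result-group prefixes; exact numbering may differ in an existing file.
#
#   /
#   `-- Measurement_000
#       `-- Channel_000
#           |-- Position_Indices, Position_Values
#           |-- Spectroscopic_Indices, Spectroscopic_Values
#           |-- Raw_Data                                  (n_pixels x n_spec_bins, complex64)
#           `-- Raw_Data-SHO_Fit_000
#               |-- Spectroscopic_Indices, Spectroscopic_Values   (Frequency dimension removed)
#               |-- Guess, Fit                            (n_pixels x n_sho_bins, sho32)
#               `-- Fit-Loop_Fit_000
#                   |-- Spectroscopic_Indices, Spectroscopic_Values   (DC_Offset dimension removed)
#                   `-- Guess, Fit                        (n_pixels x n_loops, loop_fit32)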
def _get_sho_chunk_sizes(self, max_mem_mb):
    """
    Calculates the largest number of positions that can be read into memory for a single FORC cycle

    Parameters
    ----------
    max_mem_mb : unsigned int
        Maximum allowable memory in megabytes

    Notes
    -----
    Rather than returning values, this method sets the following attributes on the instance:

    max_pos : unsigned int
        Largest number of positions that can be read into memory for a single FORC cycle
    sho_spec_inds_per_forc : unsigned int
        Number of indices in the SHO spectroscopic table that will be used per read
    metrics_spec_inds_per_forc : unsigned int
        Number of indices in the Loop metrics spectroscopic table that will be used per read
    """
    # Step 1: Find number of FORC cycles and repeats (if any), DC steps, and number of loops
    # dc_offset_index = np.argwhere(self._sho_spec_inds.attrs['labels'] == 'DC_Offset').squeeze()
    num_dc_steps = np.unique(self._sho_spec_inds[self._fit_spec_index, :]).size
    all_spec_dims = list(range(self._sho_spec_inds.shape[0]))
    all_spec_dims.remove(self._fit_spec_index)

    # Remove FORC_cycles
    sho_spec_labels = self.h5_main.spec_dim_labels
    has_forcs = 'FORC' in sho_spec_labels or 'FORC_Cycle' in sho_spec_labels
    if has_forcs:
        forc_name = 'FORC' if 'FORC' in sho_spec_labels else 'FORC_Cycle'
        forc_pos = sho_spec_labels.index(forc_name)
        # forc_pos = np.argwhere(sho_spec_labels == forc_name)[0][0]
        self._num_forcs = np.unique(self._sho_spec_inds[forc_pos]).size
        all_spec_dims.remove(forc_pos)

    # Remove FORC_repeats
    has_forc_repeats = 'FORC_repeat' in sho_spec_labels
    if has_forc_repeats:
        forc_repeat_pos = sho_spec_labels.index('FORC_repeat')
        # forc_repeat_pos = np.argwhere(sho_spec_labels == 'FORC_repeat')[0][0]
        self._num_forc_repeats = np.unique(self._sho_spec_inds[forc_repeat_pos]).size
        all_spec_dims.remove(forc_repeat_pos)

    # calculate number of loops:
    if len(all_spec_dims) == 0:
        loop_dims = 1
    else:
        loop_dims = get_dimensionality(self._sho_spec_inds, all_spec_dims)
    loops_per_forc = np.product(loop_dims)

    # Step 2: Calculate the largest number of FORCS and positions that can be read given memory limits:
    size_per_forc = num_dc_steps * loops_per_forc * len(self.h5_main.dtype) * self.h5_main.dtype[0].itemsize
    """
    How we arrive at the number for the overhead (how many times the size of the data-chunk we will use in memory)
    1 for the original data, 1 for data copied to all children processes, 1 for results, 0.5 for fit, guess, misc
    """
    mem_overhead = 3.5
    max_pos = int(max_mem_mb * 1024 ** 2 / (size_per_forc * mem_overhead))
    if self._verbose:
        print('Can read {} of {} pixels given a {} MB memory limit'.format(max_pos,
                                                                           self._sho_pos_inds.shape[0],
                                                                           max_mem_mb))
    self.max_pos = int(min(self._sho_pos_inds.shape[0], max_pos))
    self.sho_spec_inds_per_forc = int(self._sho_spec_inds.shape[1] / self._num_forcs / self._num_forc_repeats)
    self.metrics_spec_inds_per_forc = int(self._met_spec_inds.shape[1] / self._num_forcs / self._num_forc_repeats)

    # Step 3: Read allowed chunk
    self._sho_all_but_forc_inds = list(range(self._sho_spec_inds.shape[0]))
    self._met_all_but_forc_inds = list(range(self._met_spec_inds.shape[0]))
    if self._num_forcs > 1:
        self._sho_all_but_forc_inds.remove(forc_pos)
        met_forc_pos = np.argwhere(get_attr(self._met_spec_inds, 'labels') == forc_name)[0][0]
        self._met_all_but_forc_inds.remove(met_forc_pos)
    if self._num_forc_repeats > 1:
        self._sho_all_but_forc_inds.remove(forc_repeat_pos)
        met_forc_repeat_pos = np.argwhere(get_attr(self._met_spec_inds, 'labels') == 'FORC_repeat')[0][0]
        self._met_all_but_forc_inds.remove(met_forc_repeat_pos)

    return
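# Illustrative sketch (not executed): how the memory budgeting in _get_sho_chunk_sizes
# works out for a hypothetical dataset. Every number below is made up for the example;
# only the formula mirrors the code above.
#
#   num_dc_steps   = 128                     # unique DC offsets per FORC cycle
#   loops_per_forc = 2                       # e.g. one extra 'Cycle' dimension of size 2
#   n_fields       = 5                       # number of fields in the compound SHO dtype
#   itemsize       = 4                       # bytes per float32 field
#   size_per_forc  = 128 * 2 * 5 * 4         # = 5120 bytes per position
#
#   With max_mem_mb = 2048 and mem_overhead = 3.5:
#   max_pos = int(2048 * 1024 ** 2 / (5120 * 3.5))   # ~= 119837 positions per read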
def plot_cluster_h5_group(h5_group, labels_kwargs=None, centroids_kwargs=None):
    """
    Plots the cluster labels and mean response for each cluster

    Parameters
    ----------
    h5_group : h5py.Group object
        H5 group containing the labels and mean response
    labels_kwargs : dict, optional
        keyword arguments for the labels plot. NOT enabled yet.
    centroids_kwargs : dict, optional
        keyword arguments for the centroids plot. NOT enabled yet.

    Returns
    -------
    fig_labels : figure handle
        Figure containing the labels
    fig_centroids : figure handle
        Figure containing the centroids
    """
    if not isinstance(h5_group, h5py.Group):
        raise TypeError('h5_group should be a h5py.Group')

    h5_labels = USIDataset(h5_group['Labels'])
    h5_centroids = USIDataset(h5_group['Mean_Response'])

    # Default the figure handles so the function always returns a (possibly None) pair
    fig_labs, axis_labs = None, None
    fig_cent, axis_cent = None, None

    labels_mat = np.squeeze(h5_labels.get_n_dim_form())
    if labels_mat.ndim > 3:
        print('Unable to visualize 4 or more dimensional labels!')
    if labels_mat.ndim == 1:
        fig_labs, axis_labs = plt.subplots(figsize=(5.5, 5))
        axis_labs.plot(h5_labels.get_pos_values(h5_labels.pos_dim_labels[0]), labels_mat)
        axis_labs.set_xlabel(h5_labels.pos_dim_descriptors[0])
        axis_labs.set_ylabel('Cluster index')
        axis_labs.set_title(get_attr(h5_group, 'cluster_algorithm') + ' Labels')
    elif labels_mat.ndim == 2:
        fig_labs, axis_labs = plot_cluster_labels(labels_mat,
                                                  num_clusters=h5_centroids.shape[0],
                                                  x_label=h5_labels.pos_dim_descriptors[0],
                                                  y_label=h5_labels.pos_dim_descriptors[1],
                                                  x_vec=h5_labels.get_pos_values(h5_labels.pos_dim_labels[0]),
                                                  y_vec=h5_labels.get_pos_values(h5_labels.pos_dim_labels[1]),
                                                  title=get_attr(h5_group, 'cluster_algorithm') + ' Labels')

    # TODO: probably not a great idea to load the entire dataset to memory
    centroids_mat = h5_centroids.get_n_dim_form()
    if len(h5_centroids.spec_dim_labels) == 1:
        legend_mode = 2
        if h5_centroids.shape[0] < 6:
            legend_mode = 1
        fig_cent, axis_cent = plot_cluster_centroids(centroids_mat,
                                                     h5_centroids.get_spec_values(h5_centroids.spec_dim_labels[0]),
                                                     legend_mode=legend_mode,
                                                     x_label=h5_centroids.spec_dim_descriptors[0],
                                                     y_label=h5_centroids.data_descriptor,
                                                     overlayed=h5_centroids.shape[0] < 6,
                                                     title=get_attr(h5_group, 'cluster_algorithm') + ' Centroid',
                                                     amp_units=get_attr(h5_centroids, 'units'))
    elif len(h5_centroids.spec_dim_labels) == 2:
        # stack of spectrograms
        if h5_centroids.dtype in [np.complex64, np.complex128, complex]:
            fig_cent, axis_cent = plot_complex_spectra(centroids_mat,
                                                       subtitle_prefix='Cluster',
                                                       title=get_attr(h5_group, 'cluster_algorithm') + ' Centroid',
                                                       x_label=h5_centroids.spec_dim_descriptors[0],
                                                       y_label=h5_centroids.spec_dim_descriptors[1],
                                                       amp_units=get_attr(h5_centroids, 'units'))
        else:
            fig_cent, axis_cent = plot_map_stack(centroids_mat,
                                                 color_bar_mode='each',
                                                 evenly_spaced=True,
                                                 title='Cluster',
                                                 heading=get_attr(h5_group, 'cluster_algorithm') + ' Centroid')

    return fig_labs, fig_cent
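# Minimal usage sketch (not executed; the file name and group path are hypothetical -
# any results group containing 'Labels' and 'Mean_Response' datasets would do):
#
#   import h5py
#   with h5py.File('clustered_results.h5', mode='r') as h5_f:
#       h5_clust_grp = h5_f['Measurement_000/Channel_000/Raw_Data-Cluster_000']
#       fig_labels, fig_centroids = plot_cluster_h5_group(h5_clust_grp)
#       fig_labels.savefig('cluster_labels.png')
#       fig_centroids.savefig('cluster_centroids.png')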
def _check_for_old_fit(self):
    """
    Returns three lists of objects from earlier results groups, depending on whether the group contained:
        1. a completed Guess only
        2. a partial Fit
        3. a completed Fit

    Returns
    -------
    completed_guess, partial_fits, completed_fits : list
    """
    # First find all groups that match the basic condition of matching tool name
    all_groups = find_results_groups(self.h5_main, self._fitter_name)
    if self._verbose:
        print('Groups that matched the nomenclature: {}'.format(all_groups))

    # Next sort these groups into three categories:
    completed_guess = []
    partial_fits = []
    completed_fits = []

    for h5_group in all_groups:

        if 'Fit' in h5_group.keys():
            # check group for Fit dataset
            h5_fit = h5_group['Fit']

            # check Fit dataset against parms_dict
            if not check_for_matching_attrs(h5_fit, new_parms=self._parms_dict, verbose=self._verbose):
                if self._verbose:
                    print('{} did not match the given parameters'.format(h5_fit.name))
                continue

            # sort this dataset:
            try:
                last_pix = get_attr(h5_fit, 'last_pixel')
            except KeyError:
                last_pix = None

            # For now, skip any fits that are missing 'last_pixel'
            if last_pix is None:
                continue
            elif last_pix < self.h5_main.shape[0]:
                partial_fits.append(h5_fit.parent)
            else:
                completed_fits.append(h5_fit)

        else:
            if 'Guess' in h5_group.keys():
                h5_guess = h5_group['Guess']

                # sort this dataset:
                try:
                    last_pix = get_attr(h5_guess, 'last_pixel')
                except KeyError:
                    last_pix = None

                # For now, skip any guesses that are missing 'last_pixel'
                if last_pix is None:
                    continue
                elif last_pix == self.h5_main.shape[0]:
                    if self._verbose:
                        print('{} was a completed Guess'.format(h5_guess.name))
                    completed_guess.append(h5_guess)
                else:
                    if self._verbose:
                        print('{} did not have a completed Guess'.format(h5_guess.name))
            else:
                if self._verbose:
                    print('{} did not even have a Guess. Categorizing as defective Group'.format(h5_group.name))

    return completed_guess, partial_fits, completed_fits
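# Sketch of how a caller might act on the three lists returned above (the helper names
# _resume_fit and _fit_from_guess are hypothetical; the real fitter wires this decision
# up internally):
#
#   completed_guess, partial_fits, completed_fits = self._check_for_old_fit()
#   if len(completed_fits) > 0:
#       h5_results = completed_fits[-1]            # reuse the newest finished Fit
#   elif len(partial_fits) > 0:
#       self._resume_fit(partial_fits[-1])         # hypothetical: continue from 'last_pixel'
#   elif len(completed_guess) > 0:
#       self._fit_from_guess(completed_guess[-1])  # hypothetical: start the Fit from a Guess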