def test(self, override=False):
    """
    Decomposes the hdf5 dataset to calculate the components and projection.
    This function does NOT write results to the hdf5 file. Call
    :meth:`~pycroscopy.processing.Decomposition.compute()` to write to the file.
    Handles complex, compound datasets such that the components are of the same
    data-type as the input matrix.

    Parameters
    ----------
    override : bool, optional. Default = False
        Set to True to recompute results if prior results are available.
        Else, returns existing results

    Returns
    -------
    components : :class:`numpy.ndarray`
        Components
    projections : :class:`numpy.ndarray`
        Projections
    """
    if not override:
        if isinstance(self.duplicate_h5_groups, list) and len(self.duplicate_h5_groups) > 0:
            self.h5_results_grp = self.duplicate_h5_groups[-1]
            print('Returning previously computed results from: {}'.format(self.h5_results_grp.name))
            print('set the "override" flag to True to recompute results')
            return USIDataset(self.h5_results_grp['Components']).get_n_dim_form(), \
                   USIDataset(self.h5_results_grp['Projection']).get_n_dim_form()

    self.h5_results_grp = None

    print('Performing Decomposition on {}.'.format(self.h5_main.name))

    t0 = time.time()
    self._fit()
    self._transform()
    print('Took {} to compute {}'.format(format_time(time.time() - t0), self.method_name))

    self.__components = stack_real_to_target_dtype(self.estimator.components_, self.h5_main.dtype)

    projection_mat, success = reshape_to_n_dims(self.__projection,
                                                h5_pos=self.h5_main.h5_pos_inds,
                                                h5_spec=np.expand_dims(np.arange(self.__projection.shape[1]),
                                                                       axis=0))
    if not success:
        # success is a bool / error flag - cannot be concatenated to a string directly
        raise ValueError('Could not reshape projections to N-Dimensional dataset! Error: {}'.format(success))

    components_mat, success = reshape_to_n_dims(self.__components,
                                                h5_spec=self.h5_main.h5_spec_inds,
                                                h5_pos=np.expand_dims(np.arange(self.__components.shape[0]),
                                                                      axis=1))
    if not success:
        raise ValueError('Could not reshape components to N-Dimensional dataset! Error: {}'.format(success))

    return components_mat, projection_mat
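# ---- Usage sketch (illustrative, not part of the library) ----
# Shows the test() -> compute() workflow documented above. The file name
# 'decomposition_example.h5' and the internal dataset path are assumptions;
# point these at a USID main dataset in your own file.
def example_decomposition_usage():
    import h5py
    import sklearn.decomposition as dec
    from pycroscopy.processing import Decomposition

    with h5py.File('decomposition_example.h5', mode='r+') as h5_file:
        h5_main = h5_file['Measurement_000/Channel_000/Raw_Data']  # assumed location
        proc = Decomposition(h5_main, dec.FastICA(n_components=4))
        components, projections = proc.test()  # in-memory preview; writes nothing
        h5_results_grp = proc.compute()        # writes Components / Projection to file
        return components, projections, h5_results_grp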
def __init__(self, h5_main, num_components=None):
    super(SVD, self).__init__(h5_main)
    self.process_name = 'SVD'

    '''
    Calculate the size of the main data in memory and compare to max_mem
    We use the minimum of the actual dtype's itemsize and float32 since we
    don't want to read it in yet and do the proper type conversions.
    '''
    n_samples, n_features = h5_main.shape
    self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(h5_main)

    if num_components is None:
        num_components = min(n_samples, n_features)
    else:
        num_components = min(n_samples, n_features, num_components)

    self.num_components = num_components
    self.parms_dict = {'num_components': num_components}
    self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates()

    # supercharge h5_main!
    self.h5_main = USIDataset(self.h5_main)

    self.__u = None
    self.__v = None
    self.__s = None
def test_check_for_old_guess_incomplete(self):
    self.fitter._fitter_name = 'Fitter'
    # Set last_pixel to less than the number of positions
    write_simple_attrs(self.h5_guess, {'last_pixel': np.random.randint(self.h5_guess.shape[0] - 1)})
    partial, completed = self.fitter._check_for_old_guess()
    self.assertEqual(USIDataset(partial[0]), self.h5_guess)
    self.assertEqual(completed, [])
def test_check_for_old_guess_complete(self):
    self.fitter._fitter_name = 'Fitter'
    # Set last_pixel to the number of positions
    write_simple_attrs(self.h5_guess, {'last_pixel': self.h5_guess.shape[0]})
    partial, completed = self.fitter._check_for_old_guess()
    self.assertEqual(partial, [])
    self.assertEqual(USIDataset(completed[0]), self.h5_guess)
def __init__(self, h5_main, estimator, **kwargs):
    """
    Constructs the Decomposition object. Call the
    :meth:`~pycroscopy.processing.Decomposition.test()` and
    :meth:`~pycroscopy.processing.Decomposition.compute()` methods to run the
    decomposition

    Parameters
    ----------
    h5_main : :class:`pyUSID.USIDataset` object
        USID Main HDF5 dataset with embedded ancillary spectroscopic, position
        indices and values datasets
    estimator : :mod:`sklearn.decomposition` object
        configured decomposition object to apply to the data
    h5_target_group : h5py.Group, optional. Default = None
        Location where to look for existing results and to place newly computed
        results. Use this kwarg if the results need to be written to a different
        HDF5 file. By default, this value is set to the parent group containing
        `h5_main`
    """
    allowed_methods = [dec.factor_analysis.FactorAnalysis,
                       dec.fastica_.FastICA,
                       dec.incremental_pca.IncrementalPCA,
                       dec.sparse_pca.MiniBatchSparsePCA,
                       dec.nmf.NMF,
                       dec.pca.PCA,
                       dec.sparse_pca.SparsePCA,
                       dec.truncated_svd.TruncatedSVD]

    # Store the decomposition object
    self.estimator = estimator

    # could not find a nicer way to extract the method name yet
    self.method_name = str(estimator)[:str(estimator).index('(')]

    if type(estimator) not in allowed_methods:
        raise NotImplementedError('Cannot work with {} yet'.format(self.method_name))

    # Done with decomposition-related checks, now call super init
    super(Decomposition, self).__init__(h5_main, 'Decomposition', **kwargs)

    # set up parameters
    self.parms_dict = {'decomposition_algorithm': self.method_name}
    self.parms_dict.update(self.estimator.get_params())

    # check for existing datagroups with same results
    # Partial groups don't make any sense for statistical learning algorithms....
    self.duplicate_h5_groups, self.h5_partial_groups = self._check_for_duplicates()

    # figure out the operation that needs to be performed to convert to real scalar
    (self.data_transform_func, self.data_is_complex, self.data_is_compound,
     self.data_n_features, self.data_type_mult) = check_dtype(h5_main)

    # supercharge h5_main!
    self.h5_main = USIDataset(self.h5_main)

    self.__components = None
    self.__projection = None
def __init__(self, h5_main, variables=['Frequency'], parallel=True, verbose=False):
    """
    For now, we assume that the guess dataset has not been generated for this
    dataset, but we will relax this requirement after testing the basic
    components.

    Parameters
    ----------
    h5_main : h5py.Dataset instance
        The dataset over which the analysis will be performed. This dataset
        should be linked to the spectroscopic indices and values, and position
        indices and values datasets.
    variables : list(string), Default = ['Frequency']
        List of attributes that h5_main should possess so that it may be
        analyzed by Model.
    parallel : bool, optional
        Should the parallel implementation of the fitting be used. Default = True
    verbose : bool, optional. Default = False
        Whether or not to print statements that aid in debugging
    """
    if not isinstance(h5_main, USIDataset):
        h5_main = USIDataset(h5_main)

    # Checking if dataset has the proper dimensions for the model to run.
    if self._is_legal(h5_main, variables):
        self.h5_main = h5_main
    else:
        raise ValueError('Provided dataset is not a "Main" dataset with necessary ancillary datasets')

    # Checking if parallel processing will be used
    self._parallel = parallel
    self._verbose = verbose

    # Determining the max size of the data that can be put into memory
    self._set_memory_and_cores()

    self._start_pos = 0
    self._end_pos = self.h5_main.shape[0]

    self.h5_guess = None
    self.h5_fit = None
    self.h5_results_grp = None

    # TODO: do NOT expose a lot of innards. Turn it into private with _var_name
    self.data = None
    self.guess = None
    self.fit = None

    self._fitter_name = None  # Reset this in the extended classes
    self._parms_dict = dict()
def __init__(self, h5_main, estimator):
    """
    Uses the provided (preconfigured) Decomposition object to decompose the
    provided dataset

    Parameters
    ----------
    h5_main : HDF5 dataset object
        Main dataset with ancillary spectroscopic, position indices and values
        datasets
    estimator : sklearn.decomposition estimator object
        configured decomposition object to apply to the data
    """
    allowed_methods = [dec.factor_analysis.FactorAnalysis,
                       dec.fastica_.FastICA,
                       dec.incremental_pca.IncrementalPCA,
                       dec.sparse_pca.MiniBatchSparsePCA,
                       dec.nmf.NMF,
                       dec.pca.PCA,
                       dec.sparse_pca.SparsePCA,
                       dec.truncated_svd.TruncatedSVD]

    # Store the decomposition object
    self.estimator = estimator

    # could not find a nicer way to extract the method name yet
    self.method_name = str(estimator)[:str(estimator).index('(')]

    if type(estimator) not in allowed_methods:
        raise NotImplementedError('Cannot work with {} yet'.format(self.method_name))

    # Done with decomposition-related checks, now call super init
    super(Decomposition, self).__init__(h5_main)

    # set up parameters
    self.parms_dict = {'decomposition_algorithm': self.method_name}
    self.parms_dict.update(self.estimator.get_params())

    # check for existing datagroups with same results
    self.process_name = 'Decomposition'
    # Partial groups don't make any sense for statistical learning algorithms....
    self.duplicate_h5_groups, self.h5_partial_groups = self._check_for_duplicates()

    # figure out the operation that needs to be performed to convert to real scalar
    (self.data_transform_func, self.data_is_complex, self.data_is_compound,
     self.data_n_features, self.data_type_mult) = check_dtype(h5_main)

    # supercharge h5_main!
    self.h5_main = USIDataset(self.h5_main)

    self.__components = None
    self.__projection = None
def __init__(self, h5_main, num_components=None, **kwargs):
    """
    Perform the SVD decomposition on the selected dataset and write the results
    to the h5 file.

    :param h5_main: USID Main HDF5 dataset that will be decomposed
    :type h5_main: :class:`pyUSID.USIDataset` object
    :param num_components: Number of components to decompose h5_main into. Default None.
    :type num_components: int, optional
    :param h5_target_group: Location where to look for existing results and to
        place newly computed results. Use this kwarg if the results need to be
        written to a different HDF5 file. By default, this value is set to the
        parent group containing `h5_main`
    :type h5_target_group: h5py.Group, optional. Default = None
    :param kwargs: Arguments to be sent to Process
    :type kwargs: dict
    """
    super(SVD, self).__init__(h5_main, 'SVD', **kwargs)

    '''
    Calculate the size of the main data in memory and compare to max_mem
    We use the minimum of the actual dtype's itemsize and float32 since we
    don't want to read it in yet and do the proper type conversions.
    '''
    n_samples, n_features = h5_main.shape
    self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(h5_main)

    if num_components is None:
        num_components = min(n_samples, n_features)
    else:
        num_components = min(n_samples, n_features, num_components)

    self.num_components = num_components

    # Check that we can actually compute the SVD with the selected number of components
    self._check_available_mem()

    self.parms_dict = {'num_components': num_components}
    self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates()

    # supercharge h5_main!
    self.h5_main = USIDataset(self.h5_main)

    self.__u = None
    self.__v = None
    self.__s = None
def __init__(self, h5_main, num_components=None, **kwargs):
    """
    Perform the SVD decomposition on the selected dataset and write the results
    to the h5 file.

    Parameters
    ----------
    h5_main : USIDataset
        Dataset to be decomposed.
    num_components : int, optional
        Number of components to decompose h5_main into. Default None.
    kwargs
        Arguments to be sent to Process
    """
    super(SVD, self).__init__(h5_main, **kwargs)
    self.process_name = 'SVD'

    '''
    Calculate the size of the main data in memory and compare to max_mem
    We use the minimum of the actual dtype's itemsize and float32 since we
    don't want to read it in yet and do the proper type conversions.
    '''
    n_samples, n_features = h5_main.shape
    self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(h5_main)

    if num_components is None:
        num_components = min(n_samples, n_features)
    else:
        num_components = min(n_samples, n_features, num_components)

    self.num_components = num_components

    # Check that we can actually compute the SVD with the selected number of components
    self._check_available_mem()

    self.parms_dict = {'num_components': num_components}
    self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates()

    # supercharge h5_main!
    self.h5_main = USIDataset(self.h5_main)

    self.__u = None
    self.__v = None
    self.__s = None
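# ---- Usage sketch (illustrative) ----
# Drives the SVD Process defined above through the standard pyUSID Process
# interface. The file path and dataset location are assumptions; the U, S, V
# dataset names follow rebuild_svd() below.
def example_svd_usage():
    import h5py
    from pycroscopy.processing.svd_utils import SVD

    with h5py.File('data.h5', mode='r+') as h5_file:
        h5_main = h5_file['Measurement_000/Channel_000/Raw_Data']  # assumed location
        svd_proc = SVD(h5_main, num_components=64)
        h5_svd_grp = svd_proc.compute()  # writes U, S, V into a results group
        return h5_svd_grp['U'], h5_svd_grp['S'], h5_svd_grp['V']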
def _create_fit_datasets(self):
    """
    Creates the HDF5 fit dataset. pycroscopy requires that the h5 group, guess
    dataset, and the corresponding spectroscopic and position datasets be
    created and populated at this point.

    This function will create the HDF5 dataset for the fit and link it to the
    same ancillary datasets as the guess. The fit dataset will NOT be populated
    here but will instead be populated using the __setData function
    """
    if self.h5_guess is None:
        warn('Need to guess before fitting!')
        return

    if self.step_start_inds is None:
        h5_spec_inds = self.h5_main.h5_spec_inds
        self.step_start_inds = np.where(h5_spec_inds[0] == 0)[0]

    if self.num_udvs_steps is None:
        self.num_udvs_steps = len(self.step_start_inds)

    if self.freq_vec is None:
        self._get_frequency_vector()

    h5_sho_grp = self.h5_guess.parent
    write_simple_attrs(h5_sho_grp, {'SHO_fit_method': "pycroscopy BESHO"})

    # Create the fit dataset as an empty dataset of the same size and dtype
    # as the guess. Also automatically links in the ancillary datasets.
    self.h5_fit = USIDataset(create_empty_dataset(self.h5_guess, dtype=sho32, dset_name='Fit'))

    # This is necessary when comparing against new runs to avoid re-computation
    # and to resume partial computation
    write_simple_attrs(self.h5_fit, self._parms_dict)
    write_simple_attrs(self.h5_fit, {'SHO_fit_method': "pycroscopy BESHO", 'last_pixel': 0})

    self.h5_fit.file.flush()
def rebuild_svd(h5_main, components=None, cores=None, max_RAM_mb=1024):
    """
    Rebuilds the Image from the SVD results on the windows.
    Optionally, only use components less than n_comp.

    :param h5_main: dataset on which SVD was performed
    :type h5_main: hdf5 Dataset
    :param components: Defines which components to keep
        Default - None, all components kept

        Input Types
        integer : Components less than the input will be kept
        length 2 iterable of integers : Integers define start and stop of component slice to retain
        other iterable of integers or slice : Selection of component indices to retain
    :type components: {int, iterable of int, slice}, optional
    :param cores: How many cores should be used to rebuild
        Default - None, all but 2 cores will be used, min 1
    :type cores: int, optional
    :param max_RAM_mb: Maximum amount of memory to use when rebuilding, in Mb.
        Default - 1024 Mb
    :type max_RAM_mb: int, optional
    :raise: KeyError if SVD results not found
    :returns: rebuilt dataset
    :rtype: HDF5 Dataset
    """
    if not isinstance(h5_main, USIDataset):
        h5_main = USIDataset(h5_main)

    comp_slice, num_comps = get_component_slice(components, total_components=h5_main.shape[1])
    if isinstance(comp_slice, np.ndarray):
        comp_slice = list(comp_slice)
    dset_name = h5_main.name.split('/')[-1]

    # Ensure that at least one core is available for use / 2 cores are available for other use
    max_cores = max(1, cpu_count() - 2)
    if cores is not None:
        cores = min(round(abs(cores)), max_cores)
    else:
        cores = max_cores

    max_memory = min(max_RAM_mb * 1024 ** 2, 0.75 * get_available_memory())
    if cores != 1:
        max_memory = int(max_memory / 2)

    '''
    Get the handles for the SVD results
    '''
    try:
        h5_svd_group = find_results_groups(h5_main, 'SVD')[-1]
        h5_S = h5_svd_group['S']
        h5_U = h5_svd_group['U']
        h5_V = h5_svd_group['V']
    except KeyError:
        raise KeyError('SVD Results for {dset} were not found.'.format(dset=dset_name))

    func, is_complex, is_compound, n_features, type_mult = check_dtype(h5_V)

    '''
    Calculate the size of a single batch that will fit in the available memory
    '''
    n_comps = h5_S[comp_slice].size
    mem_per_pix = (h5_U.dtype.itemsize + h5_V.dtype.itemsize * h5_V.shape[1]) * n_comps
    fixed_mem = h5_main.size * h5_main.dtype.itemsize

    if cores == 1:
        free_mem = max_memory - fixed_mem
    else:
        # max_memory was already halved above when using multiple cores
        free_mem = max_memory * 2 - fixed_mem

    batch_size = int(round(float(free_mem) / mem_per_pix))
    if batch_size < 0:
        print('Calculated batch size was negative:', batch_size)
        batch_size = 100
    batch_slices = gen_batches(h5_U.shape[0], batch_size)

    print('Reconstructing in batches of {} positions.'.format(batch_size))
    print('Batches should be {} Mb each.'.format(mem_per_pix * batch_size / 1024.0 ** 2))

    '''
    Loop over all batches.
    '''
    ds_V = np.dot(np.diag(h5_S[comp_slice]), func(h5_V[comp_slice, :]))
    rebuild = np.zeros((h5_main.shape[0], ds_V.shape[1]))
    for ibatch, batch in enumerate(batch_slices):
        rebuild[batch, :] += np.dot(h5_U[batch, comp_slice], ds_V)

    rebuild = stack_real_to_target_dtype(rebuild, h5_V.dtype)

    print('Completed reconstruction of data from SVD results. Writing to file.')

    '''
    Create the Group and dataset to hold the rebuilt data
    '''
    rebuilt_grp = create_indexed_group(h5_svd_group, 'Rebuilt_Data')
    h5_rebuilt = write_main_dataset(rebuilt_grp, rebuild, 'Rebuilt_Data',
                                    get_attr(h5_main, 'quantity'),
                                    get_attr(h5_main, 'units'),
                                    None, None,
                                    h5_pos_inds=h5_main.h5_pos_inds,
                                    h5_pos_vals=h5_main.h5_pos_vals,
                                    h5_spec_inds=h5_main.h5_spec_inds,
                                    h5_spec_vals=h5_main.h5_spec_vals,
                                    chunks=h5_main.chunks,
                                    compression=h5_main.compression)

    if isinstance(comp_slice, slice):
        rebuilt_grp.attrs['components_used'] = '{}-{}'.format(comp_slice.start, comp_slice.stop)
    else:
        rebuilt_grp.attrs['components_used'] = components

    copy_attributes(h5_main, h5_rebuilt, skip_refs=False)

    h5_main.file.flush()
    print('Done writing reconstructed data to file.')

    return h5_rebuilt
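# ---- Usage sketch (illustrative) ----
# Reconstructs a dataset from its first 16 SVD components using rebuild_svd()
# above. The file path is an assumption; `components` also accepts a slice,
# a (start, stop) pair, or a list of indices, per the docstring.
def example_rebuild_svd_usage():
    import h5py
    from pycroscopy.processing.svd_utils import rebuild_svd

    with h5py.File('data.h5', mode='r+') as h5_file:
        h5_main = h5_file['Measurement_000/Channel_000/Raw_Data']  # assumed location
        h5_rebuilt = rebuild_svd(h5_main, components=16, max_RAM_mb=512)
        return h5_rebuilt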
def write_images(self):
    if bool(self.img_desc):
        for img_f, descriptors in self.img_desc.items():
            # check for existing spectrogram or image and link position/spec inds/vals
            # at most two channels' worth of data need to be checked
            try:
                str_main = str(usid.hdf_utils.get_all_main(self.h5_f['Measurement_000/Channel_000']))
                i_beg = str_main.find('located at: \n\t') + 14
                i_end = str_main.find('\nData contains') - 1
                data_loc = str_main[i_beg:i_end]
                channel_data = USIDataset(self.h5_f[data_loc])
                h5_pos_inds = channel_data.h5_pos_inds
                h5_pos_vals = channel_data.h5_pos_vals
                pos_dims = None
                write_pos_vals = False
                if channel_data.spec_dim_sizes[0] == 1:
                    h5_spec_inds = channel_data.h5_spec_inds
                    h5_spec_vals = channel_data.h5_spec_vals
                    spec_dims = None
                # if channel 000 is a spectrogram, check the next dataset
                elif channel_data.spec_dim_sizes[0] != 1:
                    str_main = str(usid.hdf_utils.get_all_main(self.h5_f['Measurement_000/Channel_001']))
                    i_beg = str_main.find('located at: \n\t') + 14
                    i_end = str_main.find('\nData contains') - 1
                    data_loc = str_main[i_beg:i_end]
                    channel_data = USIDataset(self.h5_f[data_loc])
                    # channel data is an image, & we link their spec inds/vals
                    if channel_data.spec_dim_sizes[0] == 1:
                        h5_spec_inds = channel_data.h5_spec_inds
                        h5_spec_vals = channel_data.h5_spec_vals
                        spec_dims = None
            # in case the channel does not exist, we make new spec/pos inds/vals
            except KeyError:
                # pos dims
                h5_pos_inds = None
                h5_pos_vals = None
                pos_dims = self.pos_dims
                write_pos_vals = True
                # spec dims
                h5_spec_inds = None
                h5_spec_vals = None
                spec_dims = usid.write_utils.Dimension('arb', 'a.u', 1)

            channel_i = usid.hdf_utils.create_indexed_group(self.h5_meas_grp, 'Channel_')
            h5_raw = usid.hdf_utils.write_main_dataset(channel_i,  # parent HDF5 group
                                                       (self.x_len * self.y_len, 1),  # shape of Main dataset
                                                       'Raw_' + descriptors[0].replace('-', '_'),  # Name of main dataset
                                                       descriptors[0],  # Physical quantity contained in Main dataset
                                                       descriptors[2],  # Units for the physical quantity
                                                       h5_pos_inds=h5_pos_inds,
                                                       h5_pos_vals=h5_pos_vals,
                                                       pos_dims=pos_dims,  # Position dimensions
                                                       h5_spec_inds=h5_spec_inds,
                                                       h5_spec_vals=h5_spec_vals,
                                                       spec_dims=spec_dims,  # Spectroscopic dimensions
                                                       dtype=np.float32,  # data type / precision
                                                       main_dset_attrs={'Caption': descriptors[0],
                                                                        'Scale': descriptors[1],
                                                                        'Physical_Units': descriptors[2],
                                                                        'Offset': descriptors[3]})
            h5_raw[:, :] = self.imgs[img_f].reshape(h5_raw.shape)
            if write_pos_vals:
                h5_raw.h5_pos_vals[:, :] = self.pos_val
def _setup_h5(self, data_gen_parms):
    """
    Sets up the hdf5 file structure before doing the actual generation

    Parameters
    ----------
    data_gen_parms : dict
        Dictionary containing the parameters to write to the Measurement Group
        as attributes

    Returns
    -------
    None
    """
    '''
    Build the group structure down to the channel group
    '''
    # Set up the basic group structure
    root_grp = VirtualGroup('')
    root_parms = generate_dummy_main_parms()
    root_parms['translator'] = 'FAKEBEPS'
    root_parms['data_type'] = data_gen_parms['data_type']
    root_grp.attrs = root_parms

    meas_grp = VirtualGroup('Measurement_')
    chan_grp = VirtualGroup('Channel_')

    meas_grp.attrs.update(data_gen_parms)

    # Create the Position and Spectroscopic datasets for the Raw Data
    ds_pos_inds, ds_pos_vals, ds_spec_inds, ds_spec_vals = self._build_ancillary_datasets()

    raw_chunking = calc_chunks([self.n_pixels, self.n_spec_bins],
                               np.complex64(0).itemsize,
                               unit_chunks=[1, self.n_bins])

    ds_raw_data = VirtualDataset('Raw_Data', data=None,
                                 maxshape=[self.n_pixels, self.n_spec_bins],
                                 dtype=np.complex64,
                                 compression='gzip',
                                 chunking=raw_chunking,
                                 parent=meas_grp)

    chan_grp.add_children([ds_pos_inds, ds_pos_vals, ds_spec_inds, ds_spec_vals, ds_raw_data])
    meas_grp.add_children([chan_grp])
    root_grp.add_children([meas_grp])

    hdf = HDFwriter(self.h5_path)
    hdf.delete()
    h5_refs = hdf.write(root_grp)

    # Delete the MicroDatasets to save memory
    del ds_raw_data, ds_spec_inds, ds_spec_vals, ds_pos_inds, ds_pos_vals

    # Get the file and Raw_Data objects
    h5_raw = get_h5_obj_refs(['Raw_Data'], h5_refs)[0]
    h5_chan_grp = h5_raw.parent

    # Get the Position and Spectroscopic dataset objects
    h5_pos_inds = get_h5_obj_refs(['Position_Indices'], h5_refs)[0]
    h5_pos_vals = get_h5_obj_refs(['Position_Values'], h5_refs)[0]
    h5_spec_inds = get_h5_obj_refs(['Spectroscopic_Indices'], h5_refs)[0]
    h5_spec_vals = get_h5_obj_refs(['Spectroscopic_Values'], h5_refs)[0]

    # Link the Position and Spectroscopic datasets as attributes of Raw_Data
    link_as_main(h5_raw, h5_pos_inds, h5_pos_vals, h5_spec_inds, h5_spec_vals)

    '''
    Build the SHO Group
    '''
    sho_grp = VirtualGroup('Raw_Data-SHO_Fit_', parent=h5_chan_grp.name)

    # Build the Spectroscopic datasets for the SHO Guess and Fit
    sho_spec_starts = np.where(h5_spec_inds[h5_spec_inds.attrs['Frequency']].squeeze() == 0)[0]
    sho_spec_labs = get_attr(h5_spec_inds, 'labels')
    ds_sho_spec_inds, ds_sho_spec_vals = build_reduced_spec_dsets(h5_spec_inds,
                                                                  h5_spec_vals,
                                                                  keep_dim=sho_spec_labs != 'Frequency',
                                                                  step_starts=sho_spec_starts)

    sho_chunking = calc_chunks([self.n_pixels, self.n_sho_bins],
                               sho32.itemsize,
                               unit_chunks=[1, 1])
    ds_sho_fit = VirtualDataset('Fit', data=None,
                                maxshape=[self.n_pixels, self.n_sho_bins],
                                dtype=sho32,
                                compression='gzip',
                                chunking=sho_chunking,
                                parent=sho_grp)
    ds_sho_guess = VirtualDataset('Guess', data=None,
                                  maxshape=[self.n_pixels, self.n_sho_bins],
                                  dtype=sho32,
                                  compression='gzip',
                                  chunking=sho_chunking,
                                  parent=sho_grp)

    sho_grp.add_children([ds_sho_fit, ds_sho_guess, ds_sho_spec_inds, ds_sho_spec_vals])

    # Write the SHO group and datasets to the file and delete the MicroDataset objects
    h5_sho_refs = hdf.write(sho_grp)
    del ds_sho_fit, ds_sho_guess, ds_sho_spec_inds, ds_sho_spec_vals

    # Get the dataset handles for the fit and guess
    h5_sho_fit = get_h5_obj_refs(['Fit'], h5_sho_refs)[0]
    h5_sho_guess = get_h5_obj_refs(['Guess'], h5_sho_refs)[0]

    # Get the dataset handles for the SHO Spectroscopic datasets
    h5_sho_spec_inds = get_h5_obj_refs(['Spectroscopic_Indices'], h5_sho_refs)[0]
    h5_sho_spec_vals = get_h5_obj_refs(['Spectroscopic_Values'], h5_sho_refs)[0]

    # Link the Position and Spectroscopic datasets as attributes of the SHO Fit and Guess
    link_as_main(h5_sho_fit, h5_pos_inds, h5_pos_vals, h5_sho_spec_inds, h5_sho_spec_vals)
    link_as_main(h5_sho_guess, h5_pos_inds, h5_pos_vals, h5_sho_spec_inds, h5_sho_spec_vals)

    '''
    Build the loop group
    '''
    loop_grp = VirtualGroup('Fit-Loop_Fit_', parent=h5_sho_fit.parent.name)

    # Build the Spectroscopic datasets for the loops
    loop_spec_starts = np.where(h5_sho_spec_inds[h5_sho_spec_inds.attrs['DC_Offset']].squeeze() == 0)[0]
    loop_spec_labs = get_attr(h5_sho_spec_inds, 'labels')
    ds_loop_spec_inds, ds_loop_spec_vals = build_reduced_spec_dsets(h5_sho_spec_inds,
                                                                    h5_sho_spec_vals,
                                                                    keep_dim=loop_spec_labs != 'DC_Offset',
                                                                    step_starts=loop_spec_starts)

    # Create the loop fit and guess MicroDatasets
    loop_chunking = calc_chunks([self.n_pixels, self.n_loops],
                                loop_fit32.itemsize,
                                unit_chunks=[1, 1])
    ds_loop_fit = VirtualDataset('Fit', data=None,
                                 maxshape=[self.n_pixels, self.n_loops],
                                 dtype=loop_fit32,
                                 compression='gzip',
                                 chunking=loop_chunking,
                                 parent=loop_grp)
    ds_loop_guess = VirtualDataset('Guess', data=None,
                                   maxshape=[self.n_pixels, self.n_loops],
                                   dtype=loop_fit32,
                                   compression='gzip',
                                   chunking=loop_chunking,
                                   parent=loop_grp)

    # Add the datasets to the loop group then write it to the file
    loop_grp.add_children([ds_loop_fit, ds_loop_guess, ds_loop_spec_inds, ds_loop_spec_vals])
    h5_loop_refs = hdf.write(loop_grp)

    # Delete the MicroDatasets
    del ds_loop_spec_vals, ds_loop_spec_inds, ds_loop_guess, ds_loop_fit

    # Get the handles to the datasets
    h5_loop_fit = get_h5_obj_refs(['Fit'], h5_loop_refs)[0]
    h5_loop_guess = get_h5_obj_refs(['Guess'], h5_loop_refs)[0]
    h5_loop_spec_inds = get_h5_obj_refs(['Spectroscopic_Indices'], h5_loop_refs)[0]
    h5_loop_spec_vals = get_h5_obj_refs(['Spectroscopic_Values'], h5_loop_refs)[0]

    # Link the Position and Spectroscopic datasets to the Loop Guess and Fit
    link_as_main(h5_loop_fit, h5_pos_inds, h5_pos_vals, h5_loop_spec_inds, h5_loop_spec_vals)
    link_as_main(h5_loop_guess, h5_pos_inds, h5_pos_vals, h5_loop_spec_inds, h5_loop_spec_vals)

    self.h5_raw = USIDataset(h5_raw)
    self.h5_sho_guess = USIDataset(h5_sho_guess)
    self.h5_sho_fit = USIDataset(h5_sho_fit)
    self.h5_loop_guess = USIDataset(h5_loop_guess)
    self.h5_loop_fit = USIDataset(h5_loop_fit)
    self.h5_spec_vals = h5_spec_vals
    self.h5_spec_inds = h5_spec_inds
    self.h5_sho_spec_inds = h5_sho_spec_inds
    self.h5_sho_spec_vals = h5_sho_spec_vals
    self.h5_loop_spec_inds = h5_loop_spec_inds
    self.h5_loop_spec_vals = h5_loop_spec_vals
    self.h5_file = h5_raw.file

    return
def do_guess(self, processors=None, strategy=None, options=dict(),
             h5_partial_guess=None, override=False):
    """
    Parameters
    ----------
    processors : int, optional
        Number of cores to use for computing. Default = all available - 2 cores
    strategy : string, optional
        Default is 'wavelet_peaks'. Can be one of ['wavelet_peaks',
        'relative_maximum', 'gaussian_processes']. For an updated list, run
        GuessMethods.methods
    options : dict
        Dictionary of options passed to strategy. Default options for
        wavelet_peaks: {"peaks_widths": np.array([10, 200]), "peak_step": 20}.
        For more info see the GuessMethods documentation.
    h5_partial_guess : h5py.group, optional. Default = None
        Datagroup containing (partially computed) guess results. do_guess will
        resume computation if provided.
    override : bool, optional. Default = False
        By default, will simply return duplicate results to avoid recomputing
        or resume computation on a group with partial results. Set to True to
        force fresh computation.

    Returns
    -------
    h5_guess : h5py.Dataset
        Dataset containing guesses that can be passed on to do_fit()
    """
    gm = GuessMethods()
    if strategy not in gm.methods:
        raise KeyError('Error: %s is not implemented in pycroscopy.analysis.GuessMethods to find guesses' % strategy)

    # ################## CHECK FOR DUPLICATES AND RESUME PARTIAL #######################################

    # Prepare the parms dict that will be used for comparison:
    self._parms_dict = options.copy()
    self._parms_dict.update({'strategy': strategy})

    # check for old:
    partial_dsets, completed_dsets = self._check_for_old_guess()

    if len(completed_dsets) == 0 and len(partial_dsets) == 0:
        print('No existing datasets found')
        override = True

    if not override:
        # First try to simply return any completed computation
        if len(completed_dsets) > 0:
            print('Returned previously computed results at ' + completed_dsets[-1].name)
            self.h5_guess = USIDataset(completed_dsets[-1])
            return

        # Next attempt to resume automatically if nothing is provided
        if len(partial_dsets) > 0:
            # attempt to use whatever the user provided (if legal)
            target_partial_dset = partial_dsets[-1]
            if h5_partial_guess is not None:
                if not isinstance(h5_partial_guess, h5py.Dataset):
                    raise ValueError('Provided parameter is not an h5py.Dataset object')
                if h5_partial_guess not in partial_dsets:
                    raise ValueError('Provided dataset for partial Guesses is not compatible')
                if self._verbose:
                    print('Provided partial Guess dataset was acceptable')
                target_partial_dset = h5_partial_guess

            # Finally resume from this dataset
            print('Resuming computation in group: ' + target_partial_dset.name)
            self.h5_guess = target_partial_dset
            self._start_pos = target_partial_dset.attrs['last_pixel']

    # No completed / partials available or forced via override:
    if self.h5_guess is None:
        if self._verbose:
            print('Starting a fresh computation!')
        self._start_pos = 0
        self._create_guess_datasets()

    # ################## BEGIN THE ACTUAL COMPUTING #######################################

    if processors is None:
        processors = self._maxCpus
    else:
        processors = min(int(processors), self._maxCpus)
    processors = recommend_cpu_cores(self._max_pos_per_read, processors, verbose=self._verbose)

    print("Using %s to find guesses...\n" % strategy)

    time_per_pix = 0
    num_pos = self.h5_main.shape[0] - self._start_pos
    orig_start_pos = self._start_pos

    print('You can abort this computation at any time and resume at a later time!\n'
          '\tIf you are operating in a python console, press Ctrl+C or Cmd+C to abort\n'
          '\tIf you are in a Jupyter notebook, click on "Kernel">>"Interrupt"\n')

    self._get_data_chunk()
    while self.data is not None:
        t_start = tm.time()

        opt = Optimize(data=self.data, parallel=self._parallel)
        temp = opt.computeGuess(processors=processors, strategy=strategy, options=options)

        # reorder to get one numpy array out
        temp = self._reformat_results(temp, strategy)
        self.guess = np.hstack(tuple(temp))

        # Write to file
        self._set_results(is_guess=True)

        # basic timing logs
        tot_time = np.round(tm.time() - t_start, decimals=2)  # in seconds
        if self._verbose:
            print('Done parallel computing in {} or {} per pixel'.format(format_time(tot_time),
                                                                         format_time(tot_time / self.data.shape[0])))
        if self._start_pos == orig_start_pos:
            time_per_pix = tot_time / self._end_pos  # in seconds
        else:
            time_remaining = (num_pos - self._end_pos) * time_per_pix  # in seconds
            print('Time remaining: ' + format_time(time_remaining))

        # get next batch of data
        self._get_data_chunk()

    print('Completed computing guess')
    print()
    return USIDataset(self.h5_guess)
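# ---- Usage sketch (illustrative) ----
# Calls do_guess() above on an already-constructed Fitter subclass instance.
# The option names are quoted from the docstring above; treat them and the
# two-core choice as assumptions for your own fitter.
def example_do_guess_usage(fitter):
    import numpy as np

    h5_guess = fitter.do_guess(processors=2,
                               strategy='wavelet_peaks',
                               options={'peaks_widths': np.array([10, 200]),
                                        'peak_step': 20})
    return h5_guess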
def plot_cluster_h5_group(h5_group, labels_kwargs=None, centroids_kwargs=None):
    """
    Plots the cluster labels and mean response for each cluster

    Parameters
    ----------
    h5_group : h5py.Datagroup object
        H5 group containing the labels and mean response
    labels_kwargs : dict, optional
        keyword arguments for the labels plot. NOT enabled yet.
    centroids_kwargs : dict, optional
        keyword arguments for the centroids plot. NOT enabled yet.

    Returns
    -------
    fig_labels : figure handle
        Figure containing the labels
    fig_centroids : figure handle
        Figure containing the centroids
    """
    if not isinstance(h5_group, h5py.Group):
        raise TypeError('h5_group should be a h5py.Group')
    h5_labels = USIDataset(h5_group['Labels'])
    h5_centroids = USIDataset(h5_group['Mean_Response'])

    labels_mat = np.squeeze(h5_labels.get_n_dim_form())
    if labels_mat.ndim > 3:
        print('Unable to visualize 4 or more dimensional labels!')
        # bail out early - none of the figures below can be built
        return None, None
    if labels_mat.ndim == 1:
        fig_labs, axis_labs = plt.subplots(figsize=(5.5, 5))
        axis_labs.plot(h5_labels.get_pos_values(h5_labels.pos_dim_labels[0]), labels_mat)
        axis_labs.set_xlabel(h5_labels.pos_dim_descriptors[0])
        axis_labs.set_ylabel('Cluster index')
        axis_labs.set_title(get_attr(h5_group, 'cluster_algorithm') + ' Labels')
    elif labels_mat.ndim == 2:
        fig_labs, axis_labs = plot_cluster_labels(labels_mat,
                                                  num_clusters=h5_centroids.shape[0],
                                                  x_label=h5_labels.pos_dim_descriptors[0],
                                                  y_label=h5_labels.pos_dim_descriptors[1],
                                                  x_vec=h5_labels.get_pos_values(h5_labels.pos_dim_labels[0]),
                                                  y_vec=h5_labels.get_pos_values(h5_labels.pos_dim_labels[1]),
                                                  title=get_attr(h5_group, 'cluster_algorithm') + ' Labels')

    # TODO: probably not a great idea to load the entire dataset to memory
    centroids_mat = h5_centroids.get_n_dim_form()
    if len(h5_centroids.spec_dim_labels) == 1:
        legend_mode = 2
        if h5_centroids.shape[0] < 6:
            legend_mode = 1
        fig_cent, axis_cent = plot_cluster_centroids(centroids_mat,
                                                     h5_centroids.get_spec_values(h5_centroids.spec_dim_labels[0]),
                                                     legend_mode=legend_mode,
                                                     x_label=h5_centroids.spec_dim_descriptors[0],
                                                     y_label=h5_centroids.data_descriptor,
                                                     overlayed=h5_centroids.shape[0] < 6,
                                                     title=get_attr(h5_group, 'cluster_algorithm') + ' Centroid',
                                                     amp_units=get_attr(h5_centroids, 'units'))
    elif len(h5_centroids.spec_dim_labels) == 2:
        # stack of spectrograms
        if h5_centroids.dtype in [np.complex64, np.complex128, complex]:
            fig_cent, axis_cent = plot_complex_spectra(centroids_mat,
                                                       subtitle_prefix='Cluster',
                                                       title=get_attr(h5_group, 'cluster_algorithm') + ' Centroid',
                                                       x_label=h5_centroids.spec_dim_descriptors[0],
                                                       y_label=h5_centroids.spec_dim_descriptors[1],
                                                       amp_units=get_attr(h5_centroids, 'units'))
        else:
            fig_cent, axis_cent = plot_map_stack(centroids_mat,
                                                 color_bar_mode='each',
                                                 evenly_spaced=True,
                                                 title='Cluster',
                                                 heading=get_attr(h5_group, 'cluster_algorithm') + ' Centroid')

    return fig_labs, fig_cent
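# ---- Usage sketch (illustrative) ----
# Visualizes a finished clustering result with plot_cluster_h5_group() above.
# The group path assumes the pyUSID '<source>-Cluster_000' naming convention
# and is hypothetical.
def example_plot_cluster_results():
    import h5py

    with h5py.File('data.h5', mode='r') as h5_file:
        h5_group = h5_file['Measurement_000/Channel_000/Raw_Data-Cluster_000']
        fig_labels, fig_centroids = plot_cluster_h5_group(h5_group)
        if fig_labels is not None:
            fig_labels.savefig('cluster_labels.png')
            fig_centroids.savefig('cluster_centroids.png')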
def __init__(self, h5_main, estimator, num_comps=None, **kwargs):
    """
    Constructs the Cluster object. Call the
    :meth:`~pycroscopy.processing.Cluster.test()` and
    :meth:`~pycroscopy.processing.Cluster.compute()` methods to run the
    clustering

    Parameters
    ----------
    h5_main : :class:`pyUSID.USIDataset` object
        USID Main HDF5 dataset
    estimator : :class:`sklearn.cluster` estimator
        configured clustering algorithm to be applied to the data
    num_comps : int (unsigned), optional. Default = None / all
        Number of features / spectroscopic indices to be used to cluster the data
    h5_target_group : h5py.Group, optional. Default = None
        Location where to look for existing results and to place newly computed
        results. Use this kwarg if the results need to be written to a different
        HDF5 file. By default, this value is set to the parent group containing
        `h5_main`
    """
    allowed_methods = [cls.AgglomerativeClustering,
                       cls.Birch,
                       cls.KMeans,
                       cls.MiniBatchKMeans,
                       cls.SpectralClustering]

    # could not find a nicer way to extract the method name yet
    self.method_name = str(estimator)[:str(estimator).index('(')]

    if type(estimator) not in allowed_methods:
        raise TypeError('Cannot work with {} just yet'.format(self.method_name))

    # Done with clustering-related checks, now call super init
    super(Cluster, self).__init__(h5_main, 'Cluster', **kwargs)

    # Store the clustering object
    self.estimator = estimator

    comp_attr = None  # guards the elif branch below when num_comps was specified
    if num_comps is None:
        comp_attr = 'all'

    comp_slice, num_comps = get_component_slice(num_comps, total_components=self.h5_main.shape[1])

    self.num_comps = num_comps
    self.data_slice = (slice(None), comp_slice)

    if isinstance(comp_slice, slice):
        # cannot store a slice as an attribute in hdf5; convert to a list of integers!
        inds = comp_slice.indices(self.h5_main.shape[1])
        # much like range, inds are arranged as (start, stop, step)
        if inds[0] == 0 and inds[2] == 1:
            # starting from 0 with step of 1 = up to N components
            if inds[1] >= self.h5_main.shape[1] - 1:
                comp_attr = 'all'
            else:
                comp_attr = inds[1]
        else:
            comp_attr = range(*inds)
    elif comp_attr == 'all':
        pass
    else:
        # subset of spectral components specified as an array
        comp_attr = comp_slice

    # set up parameters
    self.parms_dict = {'cluster_algorithm': self.method_name,
                       'spectral_components': comp_attr}
    self.parms_dict.update(self.estimator.get_params())

    # update n_jobs according to the cores argument
    # a different number of cores should *not* be a reason for different results,
    # so we update this flag only after checking for duplicates
    estimator.n_jobs = self._cores
    self.parms_dict.update({'n_jobs': self._cores})

    # check for existing datagroups with same results
    # Partial groups don't make any sense for statistical learning algorithms....
    self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates()

    # figure out the operation that needs to be performed to convert to real scalar
    (self.data_transform_func, self.data_is_complex, self.data_is_compound,
     self.data_n_features, self.data_type_mult) = check_dtype(h5_main)

    # supercharge h5_main!
    self.h5_main = USIDataset(self.h5_main)

    self.__labels = None
    self.__mean_resp = None
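# ---- Usage sketch (illustrative) ----
# Exercises the Cluster constructor above: k-means with 8 clusters restricted
# to the first 32 spectral components. The file path and parameter values are
# assumptions.
def example_cluster_usage():
    import h5py
    import sklearn.cluster as cls
    from pycroscopy.processing import Cluster

    with h5py.File('data.h5', mode='r+') as h5_file:
        h5_main = h5_file['Measurement_000/Channel_000/Raw_Data']  # assumed location
        clusterer = Cluster(h5_main, cls.KMeans(n_clusters=8), num_comps=32)
        h5_cluster_grp = clusterer.compute()  # writes Labels and Mean_Response
        return h5_cluster_grp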
def reshape_from_lines_to_pixels(h5_main, pts_per_cycle, scan_step_x_m=None):
    """
    Breaks up the provided raw G-mode dataset into lines and pixels (from just lines)

    Parameters
    ----------
    h5_main : h5py.Dataset object
        Reference to the main dataset that contains the raw data that is only
        broken up by lines
    pts_per_cycle : unsigned int
        Number of points in a single pixel
    scan_step_x_m : float
        Step in meters for pixels

    Returns
    -------
    h5_resh : h5py.Dataset object
        Reference to the main dataset that contains the reshaped data
    """
    if not check_if_main(h5_main):
        raise TypeError('h5_main is not a Main dataset')
    h5_main = USIDataset(h5_main)
    if pts_per_cycle % 1 != 0 or pts_per_cycle < 1:
        raise TypeError('pts_per_cycle should be a positive integer')
    if scan_step_x_m is not None:
        if not isinstance(scan_step_x_m, Number):
            raise TypeError('scan_step_x_m should be a real number')
    else:
        scan_step_x_m = 1

    if h5_main.shape[1] % pts_per_cycle != 0:
        warn('Error in reshaping the provided dataset to pixels. Check points per pixel')
        raise ValueError('h5_main size is not divisible by pts_per_cycle')

    num_cols = int(h5_main.shape[1] / pts_per_cycle)

    # TODO: DO NOT assume a simple single spectral dimension!
    single_ao = np.squeeze(h5_main.h5_spec_vals[:, :pts_per_cycle])

    spec_dims = Dimension(get_attr(h5_main.h5_spec_vals, 'labels')[0],
                          get_attr(h5_main.h5_spec_vals, 'units')[0],
                          single_ao)

    # TODO: DO NOT assume simple 1D in positions!
    pos_dims = [Dimension('X', 'm', np.linspace(0, scan_step_x_m, num_cols)),
                Dimension('Y', 'm', np.linspace(0, h5_main.h5_pos_vals[1, 0], h5_main.shape[0]))]

    h5_group = create_results_group(h5_main, 'Reshape')
    # TODO: Create empty datasets and then write for very large datasets
    h5_resh = write_main_dataset(h5_group,
                                 (num_cols * h5_main.shape[0], pts_per_cycle),
                                 'Reshaped_Data',
                                 get_attr(h5_main, 'quantity')[0],
                                 get_attr(h5_main, 'units')[0],
                                 pos_dims, spec_dims,
                                 chunks=(10, pts_per_cycle),
                                 dtype=h5_main.dtype,
                                 compression=h5_main.compression)

    # TODO: DON'T write in one shot assuming small datasets fit in memory!
    print('Starting to reshape G-mode line data. Please be patient')
    h5_resh[()] = np.reshape(h5_main[()], (-1, pts_per_cycle))

    print('Finished reshaping G-mode line data to rows and columns')

    return USIDataset(h5_resh)
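# ---- Usage sketch (illustrative) ----
# Applies reshape_from_lines_to_pixels() above to a line-by-line G-mode
# dataset. The 500 points per pixel and 1 micron step are made-up numbers.
def example_reshape_lines_to_pixels(h5_main):
    h5_resh = reshape_from_lines_to_pixels(h5_main, pts_per_cycle=500,
                                           scan_step_x_m=1e-6)
    print('Reshaped dataset shape: {}'.format(h5_resh.shape))
    return h5_resh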
def do_fit(self, processors=None, solver_type='least_squares', solver_options=None, obj_func=None,
           h5_partial_fit=None, h5_guess=None, override=False):
    """
    Generates the fit for the given dataset and writes back to file

    Parameters
    ----------
    processors : int
        Number of cpu cores the user wishes to run on. The minimum of this and
        self._maxCpus is used.
    solver_type : str
        The name of the solver in scipy.optimize to use for the fit
    solver_options : dict
        Dictionary of parameters to pass to the solver specified by `solver_type`
    obj_func : dict
        Dictionary defining the class and method containing the function to be
        fit as well as any additional function parameters.
    h5_partial_fit : h5py.group, optional. Default = None
        Datagroup containing (partially computed) fit results. do_fit will
        resume computation if provided.
    h5_guess : h5py.group, optional. Default = None
        Datagroup containing guess results. do_fit will use this if provided.
    override : bool, optional. Default = False
        By default, will simply return duplicate results to avoid recomputing
        or resume computation on a group with partial results. Set to True to
        force fresh computation.

    Returns
    -------
    h5_results : h5py.Dataset object
        Dataset with the fit parameters
    """
    # ################## PREPARE THE SOLVER #######################################

    legit_solver = solver_type in scipy.optimize.__dict__.keys()
    if not legit_solver:
        raise KeyError('Error: Solver "%s" does not exist! For additional info see scipy.optimize\n'
                       % solver_type)

    obj_func_name = obj_func['obj_func']
    legit_obj_func = obj_func_name in Fit_Methods().methods
    if not legit_obj_func:
        raise KeyError('Error: Objective function "%s" is not implemented in pycroscopy.analysis.Fit_Methods'
                       % obj_func_name)

    # ################## CHECK FOR DUPLICATES AND RESUME PARTIAL #######################################

    def _get_group_to_resume(legal_groups, provided_partial_fit):
        for h5_group in legal_groups:
            if h5_group['Fit'] == provided_partial_fit:
                return h5_group
        return None

    def _resume_fit(fitter, h5_group):
        fitter.h5_guess = h5_group['Guess']
        fitter.h5_fit = h5_group['Fit']
        fitter._start_pos = fitter.h5_fit.attrs['last_pixel']

    def _start_fresh_fit(fitter, h5_guess_legal):
        fitter.h5_guess = h5_guess_legal
        fitter._create_fit_datasets()
        fitter._start_pos = 0

    # Prepare the parms dict that will be used for comparison:
    self._parms_dict = solver_options.copy()
    self._parms_dict.update({'solver_type': solver_type})
    self._parms_dict.update(obj_func)

    completed_guess, partial_fit_groups, completed_fits = self._check_for_old_fit()

    override = override or (h5_partial_fit is not None or h5_guess is not None)
    if not override:
        # First try to simply return completed results
        if len(completed_fits) > 0:
            print('Returned previously computed results at ' + completed_fits[-1].name)
            self.h5_fit = USIDataset(completed_fits[-1])
            return

        # Next, attempt to resume automatically:
        elif len(partial_fit_groups) > 0:
            print('Will resume fitting in {}. '
                  'You can supply a dataset using the h5_partial_fit argument'.format(partial_fit_groups[-1].name))
            _resume_fit(self, partial_fit_groups[-1])

        # Finally, attempt to do fresh fitting using completed Guess:
        elif len(completed_guess) > 0:
            print('Will use {} for generating new Fit. '
                  'You can supply a dataset using the h5_guess argument'.format(completed_guess[-1].name))
            _start_fresh_fit(self, completed_guess[-1])

        else:
            raise ValueError('Could not find a compatible Guess to use for Fit. Call do_guess() before do_fit()')

    else:
        if h5_partial_fit is not None:
            h5_group = _get_group_to_resume(partial_fit_groups, h5_partial_fit)
            if h5_group is None:
                raise ValueError('Provided dataset with partial Fit was not found to be compatible')
            _resume_fit(self, h5_group)

        elif h5_guess is not None:
            if h5_guess not in completed_guess:
                raise ValueError('Provided dataset with completed Guess was not found to be compatible')
            _start_fresh_fit(self, h5_guess)

        else:
            raise ValueError('Please provide a completed Guess or a partially completed Fit to resume')

    # ################## BEGIN THE ACTUAL FITTING #######################################

    print("Using solver %s and objective function %s to fit your data\n"
          % (solver_type, obj_func['obj_func']))

    if processors is None:
        processors = self._maxCpus
    else:
        processors = min(processors, self._maxCpus)
    processors = recommend_cpu_cores(self._max_pos_per_read, processors, verbose=self._verbose)

    time_per_pix = 0
    num_pos = self.h5_main.shape[0] - self._start_pos
    orig_start_pos = self._start_pos

    print('You can abort this computation at any time and resume at a later time!\n'
          '\tIf you are operating in a python console, press Ctrl+C or Cmd+C to abort\n'
          '\tIf you are in a Jupyter notebook, click on "Kernel">>"Interrupt"\n')

    self._get_guess_chunk()
    self._get_data_chunk()

    while self.data is not None:
        t_start = tm.time()

        opt = Optimize(data=self.data, guess=self.guess, parallel=self._parallel)
        temp = opt.computeFit(processors=processors, solver_type=solver_type,
                              solver_options=solver_options, obj_func=obj_func.copy())

        # TODO: need a different .reformatResults to process fitting results
        # reorder to get one numpy array out
        temp = self._reformat_results(temp, obj_func_name)
        self.fit = np.hstack(tuple(temp))

        # Write to file
        self._set_results(is_guess=False)

        # basic timing logs
        tot_time = np.round(tm.time() - t_start, decimals=2)  # in seconds
        if self._verbose:
            print('Done parallel computing in {} or {} per pixel'.format(format_time(tot_time),
                                                                         format_time(tot_time / self.data.shape[0])))
        if self._start_pos == orig_start_pos:
            time_per_pix = tot_time / self._end_pos  # in seconds
        else:
            time_remaining = (num_pos - self._end_pos) * time_per_pix  # in seconds
            print('Time remaining: ' + format_time(time_remaining))

        # get next batch of data
        self._get_guess_chunk()
        self._get_data_chunk()

    print('Completed computing fit. Writing to file.')

    return USIDataset(self.h5_fit)
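# ---- Usage sketch (illustrative) ----
# Calls do_fit() above after a completed guess. The obj_func dictionary shape
# follows the obj_func['obj_func'] access above; the 'Fit_Methods'/'SHO' names
# and the solver options are assumptions for a SHO-style fitter.
def example_do_fit_usage(fitter):
    h5_fit = fitter.do_fit(processors=2,
                           solver_type='least_squares',
                           solver_options={'jac': '2-point'},
                           obj_func={'class': 'Fit_Methods',
                                     'obj_func': 'SHO',
                                     'xvals': fitter.freq_vec})
    return h5_fit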
def plot_svd(h5_main, savefig=False, num_plots=16, **kwargs):
    '''
    Replots the SVD results showing the scree plot, abundance maps, and
    eigenvectors.

    If h5_main is a Dataset, it will default to the most recent SVD group from
    that Dataset. If h5_main is the results group, then it will plot the values
    for that group.

    :param h5_main:
    :type h5_main: USIDataset or h5py Dataset or h5py Group
    :param savefig: Saves the figures to disk with some default names
    :type savefig: bool, optional
    :param num_plots: Default number of eigenvectors and abundance plots to show
    :type num_plots: int
    :param kwargs: keyword arguments for svd filtering
    :type kwargs: dict, optional
    '''
    if isinstance(h5_main, h5py.Group):
        _U = find_dataset(h5_main, 'U')[-1]
        _V = find_dataset(h5_main, 'V')[-1]
        units = 'arbitrary (a.u.)'
        h5_spec_vals = np.arange(_V.shape[1])
        h5_svd_group = _U.parent
    else:
        h5_svd_group = find_results_groups(h5_main, 'SVD')[-1]
        units = h5_main.attrs['quantity']
        h5_spec_vals = h5_main.get_spec_values('Time')

    h5_U = h5_svd_group['U']
    h5_V = h5_svd_group['V']
    h5_S = h5_svd_group['S']

    _U = USIDataset(h5_U)
    [num_rows, num_cols] = _U.pos_dim_sizes

    # show as many components as will actually be plotted
    abun_maps = np.reshape(h5_U[:, :num_plots], (num_rows, num_cols, -1))
    eigen_vecs = h5_V[:num_plots, :]

    skree_sum = np.zeros(h5_S.shape)
    for i in range(h5_S.shape[0]):
        skree_sum[i] = np.sum(h5_S[:i + 1]) / np.sum(h5_S)

    plt.figure()
    plt.plot(skree_sum, 'bo')
    plt.title('Cumulative Variance')
    plt.xlabel('Total Components')
    plt.ylabel('Total variance ratio (a.u.)')
    if savefig:
        plt.savefig('Cumulative_variance_plot.png')

    fig_skree, axes = plot_utils.plot_scree(h5_S, title='Scree plot')
    fig_skree.tight_layout()
    if savefig:
        plt.savefig('Scree_plot.png')

    fig_abun, axes = plot_utils.plot_map_stack(abun_maps,
                                               num_comps=num_plots,
                                               title='SVD Abundance Maps',
                                               color_bar_mode='single',
                                               cmap='inferno',
                                               reverse_dims=True,
                                               fig_mult=(3.5, 3.5),
                                               facecolor='white',
                                               **kwargs)
    fig_abun.tight_layout()
    if savefig:
        plt.savefig('Abundance_maps.png')

    fig_eigvec, axes = plot_utils.plot_curves(h5_spec_vals * 1e3, eigen_vecs,
                                              use_rainbow_plots=False,
                                              x_label='Time (ms)',
                                              y_label=units,
                                              num_plots=num_plots,
                                              subtitle_prefix='Component',
                                              title='SVD Eigenvectors',
                                              evenly_spaced=False,
                                              **kwargs)
    fig_eigvec.tight_layout()
    if savefig:
        plt.savefig('Eigenvectors.png')

    return
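# ---- Usage sketch (illustrative) ----
# Replots SVD results with plot_svd() above, either from the source dataset
# (the most recent SVD group is located automatically) or from a results group.
# The file path is an assumption; note the function expects a 'Time'
# spectroscopic dimension when given a main dataset.
def example_plot_svd_usage():
    import h5py

    with h5py.File('data.h5', mode='r') as h5_file:
        h5_main = h5_file['Measurement_000/Channel_000/Raw_Data']  # assumed location
        plot_svd(h5_main, savefig=True, num_plots=9)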
def create_empty_dataset(source_dset, dtype, dset_name, h5_group=None, new_attrs=None, skip_refs=False):
    """
    Creates an empty dataset in the h5 file based on the provided dataset in
    the same or specified group

    Parameters
    ----------
    source_dset : h5py.Dataset object
        Source object that provides information on the group and shape of the dataset
    dtype : dtype
        Data type of the fit / guess datasets
    dset_name : String / Unicode
        Name of the dataset
    h5_group : h5py.Group object, optional. Default = None
        Group within which this dataset will be created
    new_attrs : dictionary (Optional)
        Any new attributes that need to be written to the dataset
    skip_refs : boolean, optional
        Should ObjectReferences and RegionReferences be skipped when copying
        attributes from the `source_dset`

    Returns
    -------
    h5_new_dset : h5py.Dataset object
        Newly created dataset
    """
    import sys

    import h5py
    from pyUSID import USIDataset
    from pyUSID.io.dtype_utils import validate_dtype
    from pyUSID.io.hdf_utils import copy_attributes, check_if_main, write_book_keeping_attrs

    if sys.version_info.major == 3:
        unicode = str

    if not isinstance(source_dset, h5py.Dataset):
        raise TypeError('source_dset should be a h5py.Dataset object')
    _ = validate_dtype(dtype)
    if new_attrs is not None:
        if not isinstance(new_attrs, dict):
            raise TypeError('new_attrs should be a dictionary')
    else:
        new_attrs = dict()

    if h5_group is None:
        h5_group = source_dset.parent
    else:
        if not isinstance(h5_group, (h5py.Group, h5py.File)):
            raise TypeError('h5_group should be a h5py.Group or h5py.File object')

    if not isinstance(dset_name, (str, unicode)):
        raise TypeError('dset_name should be a string')
    dset_name = dset_name.strip()
    if len(dset_name) == 0:
        raise ValueError('dset_name cannot be empty!')
    if '-' in dset_name:
        warn('dset_name should not contain the "-" character. Reformatted name from:{} to '
             '{}'.format(dset_name, dset_name.replace('-', '_')))
    dset_name = dset_name.replace('-', '_')

    if dset_name in h5_group.keys():
        if isinstance(h5_group[dset_name], h5py.Dataset):
            warn('A dataset named: {} already exists in group: {}'.format(dset_name, h5_group.name))
            h5_new_dset = h5_group[dset_name]
            # Make sure it has the correct shape and dtype
            if any((source_dset.shape != h5_new_dset.shape, dtype != h5_new_dset.dtype)):
                warn('Either the shape (existing: {} desired: {}) or dtype (existing: {} desired: {}) of the dataset '
                     'did not match with expectations. Deleting and creating a new one.'.format(h5_new_dset.shape,
                                                                                                source_dset.shape,
                                                                                                h5_new_dset.dtype,
                                                                                                dtype))
                del h5_new_dset, h5_group[dset_name]
                h5_new_dset = h5_group.create_dataset(dset_name,
                                                      shape=source_dset.shape,
                                                      dtype=dtype,
                                                      chunks=source_dset.chunks)
        else:
            raise KeyError('{} is already a {} in group: {}'.format(dset_name,
                                                                    type(h5_group[dset_name]),
                                                                    h5_group.name))
    else:
        h5_new_dset = h5_group.create_dataset(dset_name,
                                              shape=source_dset.shape,
                                              dtype=dtype,
                                              chunks=source_dset.chunks)

    # This should link the ancillary datasets correctly
    h5_new_dset = copy_attributes(source_dset, h5_new_dset, skip_refs=skip_refs)
    h5_new_dset.attrs.update(new_attrs)

    if check_if_main(h5_new_dset):
        h5_new_dset = USIDataset(h5_new_dset)
        # update book keeping attributes
        write_book_keeping_attrs(h5_new_dset)

    return h5_new_dset
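# ---- Usage sketch (illustrative) ----
# Mirrors how _create_fit_datasets() above uses create_empty_dataset() to make
# a 'Fit' dataset shaped like an existing guess. The float32 dtype and the
# attribute value are placeholder assumptions.
def example_create_empty_dataset(h5_guess):
    import numpy as np

    h5_fit = create_empty_dataset(h5_guess, dtype=np.float32, dset_name='Fit',
                                  new_attrs={'fit_method': 'placeholder'})
    return h5_fit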
def __init__(self, h5_main, ex_freq, gain, num_x_steps=250, r_extra=110, **kwargs):
    """
    Applies Bayesian Inference to General Mode IV (G-IV) data to extract the
    true current

    Parameters
    ----------
    h5_main : h5py.Dataset object
        Dataset to process
    ex_freq : float
        Frequency of the excitation waveform
    gain : uint
        Gain setting on the current amplifier (typically 7-9)
    num_x_steps : uint (Optional, default = 250)
        Number of steps for the inferred results. Note: this may end up being
        slightly different from the specified value.
    r_extra : float (Optional, default = 110 [Ohms])
        Extra resistance in the RC circuit that will provide correct current
        and resistance values
    kwargs : dict
        Other parameters specific to the Process class and nuanced
        bayesian_inference parameters
    """
    super(GIVBayesian, self).__init__(h5_main, **kwargs)
    self.gain = gain
    self.ex_freq = ex_freq
    self.r_extra = r_extra
    self.num_x_steps = int(num_x_steps)
    if self.num_x_steps % 4 == 0:
        self.num_x_steps = ((self.num_x_steps // 2) + 1) * 2
    if self.verbose and self.mpi_rank == 0:
        print('ensuring that half of num_x_steps is odd. num_x_steps is now', self.num_x_steps)

    self.h5_main = USIDataset(self.h5_main)

    # take these from kwargs
    bayesian_parms = {'gam': 0.03, 'e': 10.0, 'sigma': 10.0, 'sigmaC': 1.0, 'num_samples': 2E3}

    self.parms_dict = {'freq': self.ex_freq,
                       'num_x_steps': self.num_x_steps,
                       'r_extra': self.r_extra}
    self.parms_dict.update(bayesian_parms)

    self.process_name = 'Bayesian_Inference'
    self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates()

    # Should not be extracting the excitation waveform this way!
    h5_spec_vals = self.h5_main.h5_spec_vals[0]
    self.single_ao = np.squeeze(h5_spec_vals[()])

    roll_cyc_fract = -0.25
    self.roll_pts = int(self.single_ao.size * roll_cyc_fract)
    self.rolled_bias = np.roll(self.single_ao, self.roll_pts)

    dt = 1 / (ex_freq * self.single_ao.size)
    self.dvdt = np.diff(self.single_ao) / dt
    self.dvdt = np.append(self.dvdt, self.dvdt[-1])

    self.reverse_results = None
    self.forward_results = None
    self._bayes_parms = None

    self.__first_batch = True
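# ---- Usage sketch (illustrative) ----
# Constructs the GIVBayesian Process above and runs it through the standard
# Process.compute() call. The excitation frequency, gain, and the assumption
# that h5_main is already an open USID main dataset are illustrative.
def example_giv_bayesian_usage(h5_main):
    bayes = GIVBayesian(h5_main, ex_freq=200.0, gain=8, num_x_steps=250)
    h5_bayes_grp = bayes.compute()  # writes inferred results to the file
    return h5_bayes_grp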