# Module-level imports assumed by this function (matching the pyUSID layout
# imported elsewhere in this file):
from numbers import Number

import numpy as np

from pyUSID import USIDataset
from pyUSID.io.write_utils import Dimension
from pyUSID.io.hdf_utils import check_if_main, get_attr, \
    create_results_group, write_main_dataset


def reshape_from_lines_to_pixels(h5_main, pts_per_cycle, scan_step_x_m=None):
    """
    Breaks up the provided raw G-mode dataset into lines and pixels (from just lines)

    Parameters
    ----------
    h5_main : h5py.Dataset object
        Reference to the main dataset that contains the raw data, which is broken up by lines only
    pts_per_cycle : unsigned int
        Number of points in a single pixel
    scan_step_x_m : float, optional. Default = 1
        Step in meters between pixels

    Returns
    -------
    h5_resh : h5py.Dataset object
        Reference to the main dataset that contains the reshaped data
    """
    if not check_if_main(h5_main):
        raise TypeError('h5_main is not a Main dataset')
    h5_main = USIDataset(h5_main)

    if not isinstance(pts_per_cycle, Number) or pts_per_cycle % 1 != 0 or pts_per_cycle < 1:
        raise TypeError('pts_per_cycle should be a positive integer')
    if scan_step_x_m is not None:
        if not isinstance(scan_step_x_m, Number):
            raise TypeError('scan_step_x_m should be a real number')
    else:
        scan_step_x_m = 1

    if h5_main.shape[1] % pts_per_cycle != 0:
        raise ValueError('Error in reshaping the provided dataset to pixels. The number of points '
                         'per line ({}) is not divisible by pts_per_cycle ({})'
                         ''.format(h5_main.shape[1], pts_per_cycle))

    num_cols = int(h5_main.shape[1] / pts_per_cycle)

    # TODO: DO NOT assume a single spectral dimension!
    single_ao = np.squeeze(h5_main.h5_spec_vals[:, :pts_per_cycle])

    spec_dims = Dimension(get_attr(h5_main.h5_spec_vals, 'labels')[0],
                          get_attr(h5_main.h5_spec_vals, 'units')[0],
                          single_ao)

    # TODO: DO NOT assume simple 1D in positions!
    pos_dims = [Dimension('X', 'm', np.linspace(0, scan_step_x_m, num_cols)),
                Dimension('Y', 'm', np.linspace(0, h5_main.h5_pos_vals[1, 0],
                                                h5_main.shape[0]))]

    h5_group = create_results_group(h5_main, 'Reshape')
    # TODO: Create empty datasets first, then write in chunks, for very large datasets
    h5_resh = write_main_dataset(h5_group,
                                 (num_cols * h5_main.shape[0], pts_per_cycle),
                                 'Reshaped_Data',
                                 get_attr(h5_main, 'quantity')[0],
                                 get_attr(h5_main, 'units')[0],
                                 pos_dims, spec_dims,
                                 chunks=(10, pts_per_cycle),
                                 dtype=h5_main.dtype,
                                 compression=h5_main.compression)

    # TODO: Do NOT write in one shot - this assumes the entire dataset fits in memory!
    print('Starting to reshape G-mode line data. Please be patient')
    h5_resh[()] = np.reshape(h5_main[()], (-1, pts_per_cycle))
    print('Finished reshaping G-mode line data to rows and columns')

    return USIDataset(h5_resh)
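
# Hedged usage sketch for reshape_from_lines_to_pixels(). The file name, the
# internal HDF5 path and the parameter values below are illustrative
# assumptions, not part of this module - substitute your own measurement file.
def _example_reshape_usage():
    import h5py
    with h5py.File('gmode_lines.h5', mode='r+') as h5_file:  # hypothetical file
        h5_raw = h5_file['Measurement_000/Channel_000/Raw_Data']  # hypothetical path
        # assuming 2048 points were recorded per pixel with a 100 nm pixel step
        h5_pixels = reshape_from_lines_to_pixels(h5_raw, 2048, scan_step_x_m=100e-9)
        print(h5_pixels.shape)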
# NOTE: this method assumes module-level imports of MPI (mpi4py, or None if
# unavailable), socket, psutil and cpu_count (multiprocessing), as in pyUSID.
def __init__(self, h5_main, cores=None, max_mem_mb=4 * 1024, verbose=False):
    """
    Parameters
    ----------
    h5_main : h5py.Dataset instance
        The dataset over which the analysis will be performed. This dataset
        should be linked to the spectroscopic indices and values, and position
        indices and values datasets.
    cores : uint, optional
        How many cores to use for the computation.
        Default: all available cores - 2
    max_mem_mb : uint, optional
        How much memory to use for the computation. Default: 4096 MB
    verbose : bool, optional. Default = False
        Whether or not to print debugging statements
    """
    if h5_main.file.mode != 'r+':
        raise TypeError('Need to ensure that the file is in r+ mode to write results back to the file')

    if MPI is not None:
        # If we got here, the user has intentionally asked for multi-node computation
        comm = MPI.COMM_WORLD
        self.mpi_comm = comm
        self.mpi_rank = comm.Get_rank()
        self.mpi_size = comm.Get_size()

        print("Rank {} of {} on {} sees {} logical cores on the socket".format(comm.Get_rank(), comm.Get_size(),
                                                                               socket.gethostname(), cpu_count()))

        # First, ensure that cores=logical cores in node. No point being economical / considerate
        cores = psutil.cpu_count()

        # It is sufficient if just one rank checks all this.
        if verbose and self.mpi_rank == 0:
            print('Working on {} nodes via MPI'.format(self.mpi_size))

        # Ensure that the file was opened with the correct driver
        if h5_main.file.driver != 'mpio':
            raise TypeError('The HDF5 file should have been opened with driver="mpio". '
                            'Current driver = "{}"'.format(h5_main.file.driver))
        """
        # Not sure how to check for this correctly
        messg = None
        try:
            if h5_main.file.comm != comm:
                messg = 'The HDF5 file should have been opened with comm=MPI.COMM_WORLD. Currently comm={}'.format(h5_main.file.comm)
        except AttributeError:
            messg = 'The HDF5 file should have been opened with comm=MPI.COMM_WORLD'
        if messg is not None:
            raise TypeError(messg)
        """
    else:
        if verbose:
            print('No mpi4py found. Assuming single node computation')
        self.mpi_comm = None
        self.mpi_size = 1
        self.mpi_rank = 0

    # Checking if dataset is "Main"
    if self.mpi_rank == 0:
        if not check_if_main(h5_main, verbose=verbose):
            raise ValueError('Provided dataset is not a "Main" dataset with necessary ancillary datasets')
    # Not sure if we need a barrier here.

    # Saving these as properties of the object:
    self.h5_main = USIDataset(h5_main)
    self.verbose = verbose
    self._max_pos_per_read = None
    self._max_mem_mb = None

    # Now have to be careful here since the below properties are a function of the MPI rank
    self._start_pos = None
    self._rank_end_pos = None
    self._end_pos = None
    self.__assign_job_indices(start=0)

    # Determining the max size of the data that can be put into memory
    # all ranks go through this and they need to have this value anyway
    self._set_memory_and_cores(cores=cores, mem=max_mem_mb)

    self.duplicate_h5_groups = []
    self.partial_h5_groups = []
    self.process_name = None  # Reset this in the extended classes
    self.parms_dict = None

    self._results = None
    self.h5_results_grp = None

    if self.mpi_rank == 0:
        print('Consider calling test() to check results before calling compute() which computes on the entire'
              ' dataset and writes back to the HDF5 file')
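
# Hedged sketch of how this Process-style class is typically used: subclass it,
# override the hooks that compute, allocate and write results, then instantiate
# it on a main dataset. The enclosing class is assumed to be named Process (as
# in pyUSID) and the method bodies below are illustrative placeholders, not
# part of this module.
class ExampleProcess(Process):

    @staticmethod
    def _map_function(spectrum, *args, **kwargs):
        # Unit computation applied independently at every position
        return spectrum.mean()

    def _create_results_datasets(self):
        # Allocate the HDF5 group / datasets that will hold the results
        pass

    def _write_results_chunk(self):
        # Write self._results for the current chunk of positions to the file
        pass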
def create_empty_dataset(source_dset, dtype, dset_name, h5_group=None, new_attrs=None, skip_refs=False):
    """
    Creates an empty dataset in the h5 file based on the provided dataset in the same or specified group

    Parameters
    ----------
    source_dset : h5py.Dataset object
        Source object that provides information on the group and shape of the dataset
    dtype : dtype
        Data type of the fit / guess datasets
    dset_name : String / Unicode
        Name of the dataset
    h5_group : h5py.Group object, optional. Default = None
        Group within which this dataset will be created
    new_attrs : dictionary (Optional)
        Any new attributes that need to be written to the dataset
    skip_refs : boolean, optional
        Should ObjectReferences and RegionReferences be skipped when copying attributes from the
        `source_dset`

    Returns
    -------
    h5_new_dset : h5py.Dataset object
        Newly created dataset
    """
    from warnings import warn

    import h5py

    from pyUSID.io.dtype_utils import validate_dtype
    from pyUSID.io.hdf_utils import copy_attributes, check_if_main, write_book_keeping_attrs
    from pyUSID import USIDataset

    if not isinstance(source_dset, h5py.Dataset):
        raise TypeError('source_dset should be a h5py.Dataset object')
    _ = validate_dtype(dtype)
    if new_attrs is not None:
        if not isinstance(new_attrs, dict):
            raise TypeError('new_attrs should be a dictionary')
    else:
        new_attrs = dict()

    if h5_group is None:
        h5_group = source_dset.parent
    else:
        if not isinstance(h5_group, (h5py.Group, h5py.File)):
            raise TypeError('h5_group should be a h5py.Group or h5py.File object')

    # Python 3 only: str covers what the old str / unicode pair did on Python 2
    if not isinstance(dset_name, str):
        raise TypeError('dset_name should be a string')
    dset_name = dset_name.strip()
    if len(dset_name) == 0:
        raise ValueError('dset_name cannot be empty!')
    if '-' in dset_name:
        warn('dset_name should not contain the "-" character. Reformatted name from:{} to '
             '{}'.format(dset_name, dset_name.replace('-', '_')))
    dset_name = dset_name.replace('-', '_')

    if dset_name in h5_group.keys():
        if isinstance(h5_group[dset_name], h5py.Dataset):
            warn('A dataset named: {} already exists in group: {}'.format(dset_name, h5_group.name))
            h5_new_dset = h5_group[dset_name]
            # Make sure it has the correct shape and dtype
            if any((source_dset.shape != h5_new_dset.shape, dtype != h5_new_dset.dtype)):
                warn('Either the shape (existing: {} desired: {}) or dtype (existing: {} desired: {}) of the dataset '
                     'did not match with expectations. Deleting and creating a new one.'
                     ''.format(h5_new_dset.shape, source_dset.shape, h5_new_dset.dtype, dtype))
                del h5_new_dset, h5_group[dset_name]
                h5_new_dset = h5_group.create_dataset(dset_name, shape=source_dset.shape, dtype=dtype,
                                                      chunks=source_dset.chunks)
        else:
            raise KeyError('{} is already a {} in group: {}'.format(dset_name, type(h5_group[dset_name]),
                                                                    h5_group.name))
    else:
        h5_new_dset = h5_group.create_dataset(dset_name, shape=source_dset.shape, dtype=dtype,
                                              chunks=source_dset.chunks)

    # This should link the ancillary datasets correctly
    h5_new_dset = copy_attributes(source_dset, h5_new_dset, skip_refs=skip_refs)
    h5_new_dset.attrs.update(new_attrs)

    if check_if_main(h5_new_dset):
        h5_new_dset = USIDataset(h5_new_dset)
        # update book keeping attributes
        write_book_keeping_attrs(h5_new_dset)

    return h5_new_dset
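
# Hedged usage sketch for create_empty_dataset(): the file name and internal
# path are placeholders. A typical use is pre-allocating a 'Guess' container
# that mirrors the shape of an existing dataset but with a different dtype.
def _example_create_empty():
    import h5py
    import numpy as np
    with h5py.File('analysis.h5', mode='r+') as h5_file:  # hypothetical file
        h5_source = h5_file['Measurement_000/Channel_000/Raw_Data']  # hypothetical path
        h5_guess = create_empty_dataset(h5_source, np.float32, 'Guess')
        print(h5_guess.shape, h5_guess.dtype)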
def __init__(self, h5_main, *args, verbose=False, threading=False, **kwargs):
    """
    Parameters
    ----------
    h5_main : :class:`~pyUSID.io.usi_data.USIDataset`
        The USID main HDF5 dataset over which the analysis will be performed.
    verbose : bool, optional. Default = False
        Whether or not to print debugging statements
    threading : bool, optional. Default = False
        Whether to use dask's threaded scheduler. Set to False for distributed computation.
    """
    if h5_main.file.mode != 'r+':
        raise TypeError('Need to ensure that the file is in r+ mode to write results back to the file')

    # Checking if dataset is "Main"
    if not check_if_main(h5_main):
        raise ValueError('Provided dataset is not a "Main" dataset with necessary ancillary datasets')

    # Saving these as properties of the object:
    self.h5_main = USIDataset(h5_main)
    self.verbose = verbose
    self.threading = threading  # set to False for distributed computation
    self.dtype = h5_main.dtype  # to restore the dtype after dask array operations change it
    self.duplicate_h5_groups = []
    self.partial_h5_groups = []
    self.process_name = None  # Reset this in the extended classes
    self.parms_dict = None

    """
    The name of the HDF5 dataset that should be present to signify which positions have already been computed.
    This is NOT a fully private variable so that multiple processes can be run within a single group - e.g. Fitter.
    In the case of Fitter, this name can be changed from 'completed_guesses' to 'completed_fits'.
    check_for_duplicates will be called by the child class, which has the opportunity to change this
    variable before checking for duplicates.
    """
    self._status_dset_name = 'completed_positions'

    self._results = None
    self.h5_results_grp = None

    # Check to see if the resuming feature has been implemented:
    self._resume_implemented = False
    try:
        self._get_existing_datasets()
    except NotImplementedError:
        if verbose:
            print('It appears that this class may not be able to resume computations')
    except Exception:
        # NameError for variables that don't exist
        # AttributeError for self.var_name that doesn't exist
        # TypeError (NoneType), etc.
        self._resume_implemented = True

    print('Consider calling test() to check results before calling compute() which computes on the entire'
          ' dataset and writes back to the HDF5 file')
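
# Hedged sketch of instantiating a subclass of this dask-based class. The
# subclass name 'ExampleDaskProcess' and the file / dataset paths are
# illustrative assumptions; test() and compute() are the entry points suggested
# by the message printed above.
def _example_dask_process_usage():
    import h5py
    with h5py.File('measurement.h5', mode='r+') as h5_file:  # hypothetical file
        h5_main = h5_file['Measurement_000/Channel_000/Raw_Data']  # hypothetical path
        proc = ExampleDaskProcess(h5_main, verbose=True, threading=True)
        proc.test()  # spot-check results on a small portion first
        h5_grp = proc.compute()  # full computation; results written back to the file
        print(h5_grp)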