def _create_fit_datasets(self):
    """
    Creates the HDF5 fit dataset. pycroscopy requires that the h5 group,
    guess dataset, and the corresponding spectroscopic and position datasets
    be created and populated at this point.

    This function creates the HDF5 dataset for the fit and links it to the
    same ancillary datasets as the guess. The fit dataset will NOT be
    populated here; it will instead be populated using the __setData function.
    """
    if self._h5_guess is None or self.h5_results_grp is None:
        warn('Need to guess before fitting!')
        return

    """
    Once the guess is complete, the last_pixel attribute is set to complete
    for the group. Once the fit is initiated, during the creation of the
    status dataset, this last_pixel attribute is used and it will make the
    fit look like it was already complete - which is not the case. This is a
    problem of doing two processes within the same group. Until all legacy
    code is removed, we will simply reset the last_pixel attribute.
    """
    self.h5_results_grp.attrs['last_pixel'] = 0

    write_simple_attrs(self.h5_results_grp, self.parms_dict)

    # Create the fit dataset as an empty dataset of the same size and dtype
    # as the guess.
    # Also automatically links in the ancillary datasets.
    self._h5_fit = USIDataset(create_empty_dataset(self._h5_guess,
                                                   dtype=sho32,
                                                   dset_name='Fit'))

    self._h5_fit.file.flush()

    if self.verbose and self.mpi_rank == 0:
        print('Finished creating Fit dataset')
def write_results(h5_group, dataset=None, attributes=None, process_name=None):
    found_valid_dataset = False
    if dataset is not None:
        if isinstance(dataset, Dataset):
            found_valid_dataset = True

    found_valid_attributes = False
    if attributes is not None:
        if isinstance(attributes, dict):
            if len(attributes) > 0:
                found_valid_attributes = True

    if not (found_valid_dataset or found_valid_attributes):
        raise ValueError('results must contain at least a sidpy.Dataset or a '
                         'non-empty dictionary of attributes')

    log_name = 'Log_'
    if process_name is not None:
        log_name = log_name + process_name

    log_group = create_indexed_group(h5_group, log_name)

    if found_valid_dataset:
        write_nsid_dataset(dataset, log_group)
    if found_valid_attributes:
        write_simple_attrs(log_group, flatten_dict(attributes))

    return log_group
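# Usage sketch for write_results(), hedged: assumes sidpy and h5py are
# installed and that create_indexed_group / write_nsid_dataset /
# write_simple_attrs / flatten_dict are importable in this module as above.
# File and group names are illustrative only.
#
# import h5py
# import numpy as np
# import sidpy
#
# with h5py.File('write_results_demo.h5', 'w') as h5_f:
#     sid_dset = sidpy.Dataset.from_array(np.random.rand(4, 4))
#     h5_log_grp = write_results(h5_f, dataset=sid_dset,
#                                attributes={'step': 1, 'notes': 'trial run'},
#                                process_name='Demo')
#     print(h5_log_grp.name)  # expected to look like: /Log_Demo_000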
def create_hdf5_file(self, append_path='', grp_name='Measurement',
                     overwrite=False):
    if not append_path:
        h5_path = os.path.join(self.directory,
                               self.basename.replace('.txt', '.h5'))
        if os.path.exists(h5_path):
            if not overwrite:
                raise FileExistsError('This file already exists. '
                                      'Set attribute overwrite to True')
            else:
                print('Overwriting file', h5_path)
                # os.remove(h5_path)
        self.h5_f = h5py.File(h5_path, mode='w')
    else:
        if not os.path.exists(append_path):
            raise FileNotFoundError('File does not exist. Check pathname.')
        self.h5_f = h5py.File(append_path, mode='r+')

    self.h5_meas_grp = create_indexed_group(self.h5_f, grp_name)

    write_simple_attrs(self.h5_meas_grp, self.params_dictionary)

    return
def make_simple_nsid_dataset(*args, **kwargs):
    """
    Builds an h5py file containing dataset(s) that are fully pyNSID compatible
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Note: tmp_dir has no trailing separator, so this file is created
        # next to (not inside) the temporary directory; as a side effect the
        # returned open handle survives the directory cleanup
        file_path = tmp_dir + 'nsid_simple.h5'
        h5_file = h5py.File(file_path, 'a')
        h5_group = h5_file.create_group('MyGroup')
        dsetnames = kwargs.get("dsetnames", ['data'])
        dsetshapes = kwargs.get("dsetshapes")
        if dsetshapes is None:
            dsetshapes = [(2, 3) for i in range(len(dsetnames))]
        for i, d in enumerate(dsetnames):
            data = np.random.normal(size=dsetshapes[i])
            h5_dataset = h5_group.create_dataset(d, data=data)
            attrs_to_write = {
                'quantity': 'quantity',
                'units': 'units',
                'pyNSID_version': 'version',
                'main_data_name': 'title',
                'data_type': 'UNKNOWN',
                'modality': 'modality',
                'source': 'test'
            }
            if len(args) > 0:
                for k, v in args[0].items():
                    if k in attrs_to_write:
                        attrs_to_write[k] = v
            write_simple_attrs(h5_dataset, attrs_to_write)

            dims = {
                0: h5_group.create_dataset("a{}".format(i),
                                           data=np.arange(data.shape[0])),
                1: h5_group.create_dataset("b{}".format(i),
                                           data=np.arange(data.shape[1]))
            }
            for dim, this_dim_dset in dims.items():
                name = this_dim_dset.name.split('/')[-1]
                attrs_to_write = {
                    'name': name,
                    'units': 'units',
                    'quantity': 'quantity',
                    'dimension_type': 'dimension_type.name',
                    'nsid_version': 'test'
                }
                write_simple_attrs(this_dim_dset, attrs_to_write)
                this_dim_dset.make_scale(name)
                h5_dataset.dims[dim].label = name
                h5_dataset.dims[dim].attach_scale(this_dim_dset)
        return h5_file
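# Usage sketch (hedged; names are illustrative): build a two-dataset fixture
# and override one default attribute via the positional dict.
#
# h5_f = make_simple_nsid_dataset({'quantity': 'Current'},
#                                 dsetnames=['scan_a', 'scan_b'],
#                                 dsetshapes=[(2, 3), (4, 5)])
# print(list(h5_f['MyGroup'].keys()))
# # expected: ['a0', 'a1', 'b0', 'b1', 'scan_a', 'scan_b'] (alphabetical)
# h5_f.close()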
def _translate_force_map(self, h5_meas_grp):
    """
    Reads the scan image + force map from the proprietary file and writes
    them to HDF5 datasets

    Parameters
    ----------
    h5_meas_grp : h5py.Group object
        Reference to the measurement group
    """
    # First, let's write the image into the measurement group that has
    # already been created:
    image_parms = self.meta_data['Ciao image list']
    quantity = image_parms.pop('Image Data_2')
    image_mat = self._read_image_layer(image_parms)
    h5_chan_grp = create_indexed_group(h5_meas_grp, 'Channel')
    write_main_dataset(h5_chan_grp, np.reshape(image_mat, (-1, 1)),
                       'Raw_Data',
                       # Quantity and Units need to be fixed by someone who
                       # understands these files better
                       quantity, 'a. u.',
                       [Dimension('X', 'nm', image_parms['Samps/line']),
                        Dimension('Y', 'nm', image_parms['Number of lines'])],
                       Dimension('single', 'a. u.', 1),
                       dtype=np.float32, compression='gzip')
    # Think about standardizing attributes for rows and columns
    write_simple_attrs(h5_chan_grp, image_parms)

    # Now work on the force map:
    force_map_parms = self.meta_data['Ciao force image list']
    quantity = force_map_parms.pop('Image Data_4')
    force_map_vec = self._read_data_vector(force_map_parms)
    tr_rt = [int(item) for item in
             force_map_parms['Samps/line'].split(' ')]
    force_map_2d = force_map_vec.reshape(image_mat.size, np.sum(tr_rt))
    h5_chan_grp = create_indexed_group(h5_meas_grp, 'Channel')
    write_main_dataset(h5_chan_grp, force_map_2d, 'Raw_Data',
                       # Quantity and Units need to be fixed by someone who
                       # understands these files better
                       quantity, 'a. u.',
                       [Dimension('X', 'nm', image_parms['Samps/line']),
                        Dimension('Y', 'nm', image_parms['Number of lines'])],
                       Dimension('Z', 'nm', int(np.sum(tr_rt))),
                       dtype=np.float32, compression='gzip')
    # Think about standardizing attributes
    write_simple_attrs(h5_chan_grp, force_map_parms)
def create_results_group(h5_main, tool_name, h5_parent_group=None):
    """
    Creates a h5py.Group object that is auto-indexed and named as
    'DatasetName-ToolName_00x'

    Parameters
    ----------
    h5_main : h5py.Dataset object
        Reference to the dataset on which the process / analysis is being
        performed
    tool_name : string / unicode
        Name of the Process / Analysis applied to h5_main
    h5_parent_group : h5py.Group, optional. Default = None
        Parent group under which the results group will be created. Use this
        option to write results into a new HDF5 file. By default, results
        will be written into the same group that contains `h5_main`

    Returns
    -------
    h5_group : :class:`h5py.Group`
        Results group which can now house the results datasets
    """
    if not isinstance(h5_main, h5py.Dataset):
        raise TypeError('h5_main should be a h5py.Dataset object')
    if h5_parent_group is not None:
        if not isinstance(h5_parent_group, (h5py.File, h5py.Group)):
            raise TypeError("'h5_parent_group' should either be a h5py.File "
                            "or h5py.Group object")
    else:
        h5_parent_group = h5_main.parent

    tool_name = validate_single_string_arg(tool_name, 'tool_name')

    if '-' in tool_name:
        warn('tool_name should not contain the "-" character. Reformatted '
             'name from: {} to {}'.format(tool_name,
                                          tool_name.replace('-', '_')))
    tool_name = tool_name.replace('-', '_')

    group_name = h5_main.name.split('/')[-1] + '-' + tool_name + '_'
    group_name = assign_group_index(h5_parent_group, group_name)

    h5_group = h5_parent_group.create_group(group_name)

    write_book_keeping_attrs(h5_group)

    # Also add some basic attributes like the source and tool name. This will
    # allow relaxation of nomenclature restrictions.
    # These are NOT being used right now but will be in subsequent versions
    # of pyUSID.
    # In this case, there is only one source dataset.
    write_simple_attrs(h5_group, {'tool': tool_name, 'num_source_dsets': 1})
    if h5_parent_group.file == h5_main.file:
        for dset_ind, dset in enumerate([h5_main]):
            h5_group.attrs['source_' + '{:03d}'.format(dset_ind)] = dset.ref

    return h5_group
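# Usage sketch (hedged): `h5_main` must be an existing h5py.Dataset inside an
# open, writable HDF5 file; the dataset / tool names here are illustrative.
#
# h5_grp = create_results_group(h5_main, 'SHO_Fit')
# print(h5_grp.name)
# # For a source dataset named 'Raw_Data', the group is created next to it,
# # e.g. .../Raw_Data-SHO_Fit_000 (the index increments on repeated calls).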
def _translate_gwy(self, file_path, meas_grp):
    """
    Parameters
    ----------
    file_path
    meas_grp

    For more information on the .gwy file format visit the link below -
    http://gwyddion.net/documentation/user-guide-en/gwyfile-format.html
    """
    # Need to build a set of channels to test against and a function-level
    # variable to write to
    channels = {}

    # Read the data in from the specified file
    gwy_data = gwyfile.load(file_path)
    for obj in gwy_data:
        gwy_key = obj.split('/')
        try:
            # If the second index of the gwy_key can be cast to an int, then
            # the object needs to be processed either as an image or a graph
            int(gwy_key[1])
            if gwy_key[2] == 'graph':
                # graph processing
                self.global_parms['data_type'] = 'GwyddionGWY_' + 'Graph'
                channels = self._translate_graph(meas_grp, gwy_data, obj,
                                                 channels)
            elif obj.endswith('data'):
                self.global_parms['data_type'] = 'GwyddionGWY_' + 'Image'
                channels = self._translate_image_stack(meas_grp, gwy_data,
                                                       obj, channels)
            else:
                continue
        except ValueError:
            # If the second index of the gwy_key cannot be cast to an int,
            # then the object needs to be processed either as spectra, a
            # volume, or xyz data
            if gwy_key[1] == 'sps':
                self.global_parms['data_type'] = 'GwyddionGWY_' + 'Spectra'
                channels = self._translate_spectra(meas_grp, gwy_data, obj,
                                                   channels)
            elif gwy_key[1] == 'brick':
                self.global_parms['data_type'] = 'GwyddionGWY_' + 'Volume'
                channels = self._translate_volume(meas_grp, gwy_data, obj,
                                                  channels)
            elif gwy_key[1] == 'xyz':
                self.global_parms['data_type'] = 'GwyddionGWY_' + 'XYZ'
                channels = self._translate_xyz(meas_grp, gwy_data, obj,
                                               channels)
    write_simple_attrs(meas_grp.parent, self.global_parms)
def _write_source_dset_provenance(self):
    """
    Writes the path of the HDF5 file and the path of h5_main to the results
    group if results are being written to a new HDF5 file
    """
    if self.h5_main.file == self.h5_results_grp.file:
        return
    write_simple_attrs(self.h5_results_grp,
                       {'source_file_path': self.h5_main.file.filename,
                        'source_dataset_path': self.h5_main.name})
def _write_results_chunk(self):
    """
    Writes the provided SVD results to file
    """
    comp_dim = Dimension('Principal Component', 'a. u.', len(self.__s))

    h5_svd_group = create_results_group(
        self.h5_main, self.process_name,
        h5_parent_group=self._h5_target_group)
    self.h5_results_grp = h5_svd_group
    self._write_source_dset_provenance()

    write_simple_attrs(h5_svd_group, self.parms_dict)
    write_simple_attrs(h5_svd_group, {'svd_method': 'sklearn-randomized'})

    h5_u = write_main_dataset(h5_svd_group, np.float32(self.__u), 'U',
                              'Abundance', 'a.u.', None, comp_dim,
                              h5_pos_inds=self.h5_main.h5_pos_inds,
                              h5_pos_vals=self.h5_main.h5_pos_vals,
                              dtype=np.float32,
                              chunks=calc_chunks(self.__u.shape,
                                                 np.float32(0).itemsize))
    # print(get_attr(self.h5_main, 'quantity')[0])
    h5_v = write_main_dataset(h5_svd_group, self.__v, 'V',
                              get_attr(self.h5_main, 'quantity')[0], 'a.u.',
                              comp_dim, None,
                              h5_spec_inds=self.h5_main.h5_spec_inds,
                              h5_spec_vals=self.h5_main.h5_spec_vals,
                              chunks=calc_chunks(self.__v.shape,
                                                 self.h5_main.dtype.itemsize))

    # No point making this 1D dataset a main dataset
    h5_s = h5_svd_group.create_dataset('S', data=np.float32(self.__s))

    '''
    Check h5_main for plot group references.
    Copy them into V if they exist
    '''
    for key in self.h5_main.attrs.keys():
        if '_Plot_Group' not in key:
            continue
        ref_inds = get_indices_for_region_ref(self.h5_main,
                                              self.h5_main.attrs[key],
                                              return_method='corners')
        ref_inds = ref_inds.reshape([-1, 2, 2])
        ref_inds[:, 1, 0] = h5_v.shape[0] - 1
        svd_ref = create_region_reference(h5_v, ref_inds)
        h5_v.attrs[key] = svd_ref

    # Marking completion:
    self._status_dset_name = 'completed_positions'
    self._h5_status_dset = h5_svd_group.create_dataset(
        self._status_dset_name,
        data=np.ones(self.h5_main.shape[0], dtype=np.uint8))
    # keeping legacy option:
    h5_svd_group.attrs['last_pixel'] = self.h5_main.shape[0]
def _write_dset_attributes(h5_dset, attrs, print_log=False):
    """
    Writes attributes to a h5py dataset

    Parameters
    ----------
    h5_dset : h5py.Dataset object
        h5py dataset to which the attributes will be written. This function
        handles region references as well
    attrs : dict
        Dictionary containing the attributes as key-value pairs
    print_log : bool, optional. Default=False
        Whether or not to print debugging statements
    """
    # Validate h5_dset first: if it is not a dataset, we cannot even reach
    # its parent file to abort safely
    if not isinstance(h5_dset, h5py.Dataset):
        raise TypeError('h5_dset should be a h5py Dataset object but is '
                        'instead of type {}. UNABLE to safely abort'
                        ''.format(type(h5_dset)))
    if not isinstance(attrs, dict):
        HDFwriter.__safe_abort(h5_dset.file)
        raise TypeError('attrs should be a dictionary but is instead of type '
                        '{}'.format(type(attrs)))

    # First, set aside the complicated attribute(s)
    attr_dict = attrs.copy()
    labels_dict = attr_dict.pop('labels', None)

    # Next, write the simple ones using a centralized function
    write_simple_attrs(h5_dset, attr_dict, obj_type='dataset',
                       verbose=print_log)

    if labels_dict is None:
        if print_log:
            print('Finished writing all attributes of dataset')
        return

    if isinstance(labels_dict, (tuple, list)):
        # What if the labels dictionary is just a list of names? Make a
        # dictionary using the names. This is the most that can be done.
        labels_dict = attempt_reg_ref_build(h5_dset, labels_dict,
                                            verbose=print_log)

    if len(labels_dict) == 0:
        if print_log:
            warn('No region references to write')
        return
    # Now, handle the region references attribute:
    write_region_references(h5_dset, labels_dict, verbose=print_log)
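# Shape of the `attrs` argument, hedged illustration (the 'labels' entry is
# optional and is routed to region-reference handling; all other keys are
# written as simple attributes; the call below is illustrative only):
#
# attrs = {'quantity': 'Bias',
#          'units': 'V',
#          'labels': {'even_rows': (slice(0, None, 2), slice(None)),
#                     'odd_rows': (slice(1, None, 2), slice(None))}}
# HDFwriter._write_dset_attributes(h5_dset, attrs)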
def _translate_image_stack(self, h5_meas_grp):
    """
    Reads the scan images from the proprietary file and writes them to HDF5
    datasets

    Parameters
    ----------
    h5_meas_grp : h5py.Group object
        Reference to the measurement group
    """
    # Since multiple channels will share the same position and spectroscopic
    # dimensions, why not share them?
    h5_spec_inds, h5_spec_vals = write_ind_val_dsets(
        h5_meas_grp, Dimension('single', 'a. u.', 1), is_spectral=True)

    # Find out the size of the scan images from the metadata:
    layer_info = None
    for class_name in self.meta_data.keys():
        if 'Ciao image list' in class_name:
            layer_info = self.meta_data[class_name]
            break

    h5_pos_inds, h5_pos_vals = write_ind_val_dsets(
        h5_meas_grp,
        [Dimension('X', 'nm', layer_info['Samps/line']),
         Dimension('Y', 'nm', layer_info['Number of lines'])],
        is_spectral=False)

    for class_name in self.meta_data.keys():
        if 'Ciao image list' in class_name:
            layer_info = self.meta_data[class_name]
            quantity = layer_info.pop('Image Data_2')
            data = self._read_image_layer(layer_info)
            h5_chan_grp = create_indexed_group(h5_meas_grp, 'Channel')
            write_main_dataset(h5_chan_grp, np.reshape(data, (-1, 1)),
                               'Raw_Data',
                               # Quantity and Units need to be fixed by
                               # someone who understands these files better
                               quantity, 'a. u.',
                               None, None,
                               dtype=np.float32, compression='gzip',
                               h5_pos_inds=h5_pos_inds,
                               h5_pos_vals=h5_pos_vals,
                               h5_spec_inds=h5_spec_inds,
                               h5_spec_vals=h5_spec_vals)
            # Think about standardizing attributes for rows and columns
            write_simple_attrs(h5_chan_grp, layer_info)
def _translate_force_curve(self, h5_meas_grp):
    """
    Reads the force curves from the proprietary file and writes them to HDF5
    datasets

    Parameters
    ----------
    h5_meas_grp : h5py.Group object
        Reference to the measurement group
    """
    # Since multiple channels will share the same position and spectroscopic
    # dimensions, why not share them?
    h5_pos_inds, h5_pos_vals = write_ind_val_dsets(
        h5_meas_grp, Dimension('single', 'a. u.', 1), is_spectral=False)

    # Find out the size of the force curves from the metadata:
    layer_info = None
    for class_name in self.meta_data.keys():
        if 'Ciao force image list' in class_name:
            layer_info = self.meta_data[class_name]
            break
    tr_rt = [int(item) for item in layer_info['Samps/line'].split(' ')]

    h5_spec_inds, h5_spec_vals = write_ind_val_dsets(
        h5_meas_grp, Dimension('Z', 'nm', int(np.sum(tr_rt))),
        is_spectral=True)

    for class_name in self.meta_data.keys():
        if 'Ciao force image list' in class_name:
            layer_info = self.meta_data[class_name]
            quantity = layer_info.pop('Image Data_4')
            data = self._read_data_vector(layer_info)
            h5_chan_grp = create_indexed_group(h5_meas_grp, 'Channel')
            write_main_dataset(h5_chan_grp, np.expand_dims(data, axis=0),
                               'Raw_Data',
                               # Quantity and Units need to be fixed by
                               # someone who understands these files better
                               quantity, 'a. u.',
                               None, None,
                               dtype=np.float32, compression='gzip',
                               h5_pos_inds=h5_pos_inds,
                               h5_pos_vals=h5_pos_vals,
                               h5_spec_inds=h5_spec_inds,
                               h5_spec_vals=h5_spec_vals)
            # Think about standardizing attributes
            write_simple_attrs(h5_chan_grp, layer_info)
def write_book_keeping_attrs(h5_obj):
    """
    Writes basic book-keeping and posterity-related attributes to groups
    created in pyNSID, such as machine id, pyNSID version, and timestamp.

    Parameters
    ----------
    h5_obj : :class:`h5py.Dataset`, :class:`h5py.Group`, or :class:`h5py.File`
        Object to which basic book-keeping attributes need to be written
    """
    hut.write_book_keeping_attrs(h5_obj)
    hut.write_simple_attrs(h5_obj, {'pyNSID_version': py_nsid_version})
def write_pynsid_book_keeping_attrs(h5_object):
    """
    Writes book-keeping information to the HDF5 object

    Parameters
    ----------
    h5_object : :class:`h5py.Dataset`, :class:`h5py.Group`, or :class:`h5py.File`
        Object to which the book-keeping attributes will be written
    """
    write_book_keeping_attrs(h5_object)
    write_simple_attrs(h5_object, {'pyNSID_version': pynsid_version})
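# Usage sketch (hedged): stamp a freshly created group. The attribute names
# written by sidpy's write_book_keeping_attrs typically include 'machine_id'
# and 'timestamp', plus the 'pyNSID_version' added above; file / group names
# here are illustrative.
#
# import h5py
# with h5py.File('bookkeeping_demo.h5', 'w') as h5_f:
#     h5_grp = h5_f.create_group('Measurement_000')
#     write_pynsid_book_keeping_attrs(h5_grp)
#     print(dict(h5_grp.attrs))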
def _create_guess_datasets(self):
    """
    Creates the h5 group, guess dataset, and the corresponding spectroscopic
    datasets, and also links the guess dataset to the spectroscopic datasets.
    """
    self.h5_results_grp = create_results_group(
        self.h5_main, self.process_name,
        h5_parent_group=self._h5_target_group)
    write_simple_attrs(self.h5_results_grp, self.parms_dict)

    # If writing to a new HDF5 file:
    # Add back the data_type attribute - it is still being used by the
    # visualizer
    if self.h5_results_grp.file != self.h5_main.file:
        write_simple_attrs(self.h5_results_grp.file,
                           {'data_type': get_attr(self.h5_main.file,
                                                  'data_type')})

    ret_vals = write_reduced_anc_dsets(self.h5_results_grp,
                                       self.h5_main.h5_spec_inds,
                                       self.h5_main.h5_spec_vals,
                                       self._fit_dim_name,
                                       verbose=self.verbose)

    h5_sho_inds, h5_sho_vals = ret_vals

    self._h5_guess = write_main_dataset(
        self.h5_results_grp, (self.h5_main.shape[0], self.num_udvs_steps),
        'Guess', 'SHO', 'compound', None, None,
        h5_pos_inds=self.h5_main.h5_pos_inds,
        h5_pos_vals=self.h5_main.h5_pos_vals,
        h5_spec_inds=h5_sho_inds, h5_spec_vals=h5_sho_vals,
        chunks=(1, self.num_udvs_steps), dtype=sho32,
        main_dset_attrs=self.parms_dict, verbose=self.verbose)

    # Does not make sense to propagate region refs - nobody uses them
    # copy_region_refs(self.h5_main, self._h5_guess)

    self._h5_guess.file.flush()

    if self.verbose and self.mpi_rank == 0:
        print('Finished creating Guess dataset')
def _create_root_image(self, image_path):
    """
    Create the Groups and Datasets for a single root image

    Parameters
    ----------
    image_path : str
        Path to the image file

    Returns
    -------
    None
    """
    image, image_parms = read_dm3(image_path)
    if image.ndim == 3:
        image = np.sum(image, axis=0)

    '''
    Create the Measurement and Channel Groups to hold the image Datasets
    '''
    meas_grp = create_indexed_group(self.h5_f, 'Measurement')

    chan_grp = create_indexed_group(meas_grp, 'Channel')

    '''
    Set the Measurement Group attributes
    '''
    usize, vsize = image.shape
    image_parms['image_size_u'] = usize
    image_parms['image_size_v'] = vsize
    image_parms['translator'] = 'OneView'
    image_parms['num_pixels'] = image.size
    write_simple_attrs(meas_grp, image_parms)

    '''
    Build Spectroscopic and Position dimensions
    '''
    spec_desc = Dimension('Image', 'a.u.', [1])
    pos_desc = [Dimension('X', 'pixel', np.arange(image.shape[0])),
                Dimension('Y', 'pixel', np.arange(image.shape[1]))]

    h5_image = write_main_dataset(chan_grp, np.reshape(image, (-1, 1)),
                                  'Raw_Data', 'Intensity', 'a.u.',
                                  pos_desc, spec_desc)

    self.root_image_list.append(h5_image)
def translate(self, file_path, *args, **kwargs):
    # Two kinds of files:
    # 1. Simple GSF files -> use metadata, data = gsf_read(file_path)
    # 2. Native .gwy files -> use the gwyfile package
    # I have a notebook that shows how such data can be read.

    # Create the .h5 file from the input file
    if not isinstance(file_path, str):
        raise TypeError('file_path should be a string!')
    if not (file_path.endswith('.gsf') or file_path.endswith('.gwy')):
        # TODO: Gwyddion is weird - it doesn't append the file extension
        # sometimes. In theory, you could identify the kind of file by
        # looking at the header (line 38 in gsf_read()). Ideally the header
        # check should be used instead of the extension check.
        raise ValueError('file_path must have a .gsf or .gwy extension!')

    file_path = path.abspath(file_path)
    folder_path, base_name = path.split(file_path)
    base_name = base_name[:-4]

    h5_path = path.join(folder_path, base_name + '.h5')
    if path.exists(h5_path):
        remove(h5_path)

    self.h5_file = h5py.File(h5_path, 'w')

    """
    Setup the global parameters
    ---------------------------
    translator: Gwyddion
    data_type: depends on file type
        GwyddionGSF_<gsf_meta['title']> or
        GwyddionGWY_<gwy_meta['title']>
    """
    self.global_parms = dict()
    self.global_parms['translator'] = 'Gwyddion'

    # Create the measurement group
    meas_grp = create_indexed_group(self.h5_file, 'Measurement')

    if file_path.endswith('.gsf'):
        self._translate_gsf(file_path, meas_grp)

    if file_path.endswith('.gwy'):
        self._translate_gwy(file_path, meas_grp)

    write_simple_attrs(self.h5_file, self.global_parms)

    return h5_path
def get_dim_dict(dims: Tuple[int, ...]) -> Dict[int, h5py.Dataset]:
    h5_f = h5py.File('test2.h5', 'a')
    h5_group = h5_f.create_group('MyGroup2')
    dim_dict = {}
    names = ['X', 'Y', 'Z', 'F']
    for i, d in enumerate(dims):
        dim_dict[i] = h5_group.create_dataset(names[i], data=np.arange(d))
    for dim, this_dim_dset in dim_dict.items():
        name = this_dim_dset.name.split('/')[-1]
        attrs_to_write = {
            'name': name,
            'units': 'units',
            'quantity': 'quantity',
            'dimension_type': 'dimension_type.name',
            'nsid_version': 'test'
        }
        write_simple_attrs(this_dim_dset, attrs_to_write)
    return dim_dict
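# Usage sketch (hedged): supports at most four dimensions, limited by the
# hard-coded ['X', 'Y', 'Z', 'F'] name list above. Note that the function
# opens 'test2.h5' in append mode, so a second call against an existing file
# would fail on the duplicate 'MyGroup2' group.
#
# dim_dict = get_dim_dict((5, 3))
# print({k: v.name for k, v in dim_dict.items()})
# # expected: {0: '/MyGroup2/X', 1: '/MyGroup2/Y'}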
def make_nsid_dataset_no_dim_attached():
    """
    Builds an h5 dataset that is fully pyNSID compatible, except that the
    dimension scales are not attached
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Note: tmp_dir has no trailing separator, so this file is created
        # next to (not inside) the temporary directory; as a side effect the
        # returned open handle survives the directory cleanup
        file_path = tmp_dir + 'nsid_simple.h5'
        h5_file = h5py.File(file_path, 'a')
        h5_group = h5_file.create_group('MyGroup')
        data = np.random.normal(size=(2, 3))
        h5_dataset = h5_group.create_dataset('data', data=data)
        attrs_to_write = {
            'quantity': 'quantity',
            'units': 'units',
            'nsid_version': 'version',
            'main_data_name': 'title',
            'data_type': 'UNKNOWN',
            'modality': 'modality',
            'source': 'test'
        }
        write_simple_attrs(h5_dataset, attrs_to_write)

        dims = {
            0: h5_group.create_dataset('a', data=np.arange(data.shape[0])),
            1: h5_group.create_dataset('b', data=np.arange(data.shape[1]))
        }
        for dim, this_dim_dset in dims.items():
            name = this_dim_dset.name.split('/')[-1]
            attrs_to_write = {
                'name': name,
                'units': 'units',
                'quantity': 'quantity',
                'dimension_type': 'dimension_type.name',
                'nsid_version': 'test'
            }
            write_simple_attrs(this_dim_dset, attrs_to_write)
            this_dim_dset.make_scale(name)
            h5_dataset.dims[dim].label = name
            # h5_dataset.dims[dim].attach_scale(this_dim_dset)
        return h5_file
def _setupH5(self, image_parms):
    """
    Sets up the HDF5 file in which to store the data.
    Due to the structure of the ndata format, we can only create the
    Measurement and Channel groups here

    Parameters
    ----------
    image_parms : dict
        Dictionary of parameters

    Returns
    -------
    h5_channels : list of h5py.Group
        HDF5 Channel groups into which the image data will be written
    """
    root_parms = dict()
    root_parms['data_type'] = 'PtychographyData'

    # Create the hdf5 data Group
    write_simple_attrs(self.h5_f, root_parms)

    h5_channels = list()
    for meas_parms in image_parms:
        # Create a new measurement group for each set of parameters
        meas_grp = create_indexed_group(self.h5_f, 'Measurement')
        # Write the parameters as attributes of the group
        write_simple_attrs(meas_grp, meas_parms)
        chan_grp = create_indexed_group(meas_grp, 'Channel')
        h5_channels.append(chan_grp)

    self.h5_f.flush()

    return h5_channels
def write_simple_attrs(h5_obj, attrs, obj_type='', verbose=False):
    """
    Writes attributes to a h5py object

    Parameters
    ----------
    h5_obj : :class:`h5py.File`, :class:`h5py.Group`, or h5py.Dataset object
        h5py object to which the attributes will be written
    attrs : dict
        Dictionary containing the attributes as key-value pairs
    obj_type : str or unicode, optional. Default = ''
        type of h5py.obj. Examples include 'group', 'file', 'dataset'
    verbose : bool, optional. Default=False
        Whether or not to print debugging statements
    """
    warn('pyUSID.io.hdf_utils.write_simple_attrs has been moved to '
         'sidpy.hdf.hdf_utils.write_simple_attrs. This copy in pyUSID will '
         'be removed in a future release. Please update your import '
         'statements')
    return hut.write_simple_attrs(h5_obj, attrs, obj_type=obj_type,
                                  verbose=verbose)
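# Migration sketch (hedged): the deprecation warning above points at sidpy,
# so new code can call it directly; the file name here is illustrative.
#
# import h5py
# from sidpy.hdf.hdf_utils import write_simple_attrs
#
# with h5py.File('attrs_demo.h5', 'w') as h5_f:
#     write_simple_attrs(h5_f, {'translator': 'Demo', 'version': '0.0.1'})
#     print(dict(h5_f.attrs))  # {'translator': 'Demo', 'version': '0.0.1'}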
def _create_results_datasets(self):
    """
    Creates hdf5 datasets and data groups to hold the results
    """
    # create all h5 datasets here:
    num_pos = self.h5_main.shape[0]

    if self.verbose and self.mpi_rank == 0:
        print('Now creating the datasets')

    self.h5_results_grp = create_results_group(
        self.h5_main, self.process_name,
        h5_parent_group=self._h5_target_group)

    write_simple_attrs(self.h5_results_grp,
                       {'algorithm_author': 'Kody J. Law', 'last_pixel': 0})
    write_simple_attrs(self.h5_results_grp, self.parms_dict)

    if self.verbose and self.mpi_rank == 0:
        print('created group: {} with attributes:'.format(
            self.h5_results_grp.name))
        print(get_attributes(self.h5_results_grp))

    # One of those rare instances when the result is exactly the same as the
    # source
    self.h5_i_corrected = create_empty_dataset(
        self.h5_main, np.float32, 'Corrected_Current',
        h5_group=self.h5_results_grp)

    if self.verbose and self.mpi_rank == 0:
        print('Created I Corrected')
        # print_tree(self.h5_results_grp)

    # For some reason, we cannot specify chunks or compression!
    # The resistance dataset requires the creation of a new spectroscopic
    # dimension
    self.h5_resistance = write_main_dataset(
        self.h5_results_grp, (num_pos, self.num_x_steps), 'Resistance',
        'Resistance', 'GOhms', None,
        Dimension('Bias', 'V', self.num_x_steps),
        dtype=np.float32,
        # chunks=(1, self.num_x_steps), compression='gzip',
        h5_pos_inds=self.h5_main.h5_pos_inds,
        h5_pos_vals=self.h5_main.h5_pos_vals)

    if self.verbose and self.mpi_rank == 0:
        print('Created Resistance')
        # print_tree(self.h5_results_grp)

    assert isinstance(self.h5_resistance, USIDataset)  # only here for PyCharm
    self.h5_new_spec_vals = self.h5_resistance.h5_spec_vals

    # The variance is identical to the resistance dataset
    self.h5_variance = create_empty_dataset(self.h5_resistance, np.float32,
                                            'R_variance')

    if self.verbose and self.mpi_rank == 0:
        print('Created Variance')
        # print_tree(self.h5_results_grp)

    # The capacitance dataset requires new spectroscopic dimensions as well
    self.h5_cap = write_main_dataset(
        self.h5_results_grp, (num_pos, 1), 'Capacitance', 'Capacitance',
        'pF', None, Dimension('Direction', '', [1]),
        h5_pos_inds=self.h5_main.h5_pos_inds,
        h5_pos_vals=self.h5_main.h5_pos_vals,
        dtype=cap_dtype,  # compression='gzip',
        aux_spec_prefix='Cap_Spec_')

    if self.verbose and self.mpi_rank == 0:
        print('Created Capacitance')
        # print_tree(self.h5_results_grp)
        print('Done creating all results datasets!')

    if self.mpi_size > 1:
        self.mpi_comm.Barrier()
    self.h5_main.file.flush()
def translate(self, parm_path):
    """
    The main function that translates the provided file into a .h5 file

    Parameters
    ----------
    parm_path : string / unicode
        Absolute file path of the parameters .mat file.

    Returns
    -------
    h5_path : string / unicode
        Absolute path of the translated h5 file
    """
    parm_path = path.abspath(parm_path)
    parm_dict, excit_wfm = self._read_parms(parm_path)
    excit_wfm = excit_wfm[1::2]
    self._parse_file_path(parm_path)

    num_dat_files = len(self.file_list)

    f = open(self.file_list[0], 'rb')
    spectrogram_size, count_vals = self._parse_spectrogram_size(f)
    print("Excitation waveform shape: ", excit_wfm.shape)
    print("spectrogram size:", spectrogram_size)
    num_pixels = parm_dict['grid_num_rows'] * parm_dict['grid_num_cols']
    print('Number of pixels: ', num_pixels)
    print('Count Values: ', count_vals)
    # if (num_pixels + 1) != count_vals:
    #     print("Data size does not match number of pixels expected. "
    #           "Cannot continue")

    # Find out how many channels we have to make
    num_ai_chans = num_dat_files // 2  # Division by 2 due to real/imaginary

    # Now start creating datasets and populating:
    # Start with getting an h5 file
    h5_file = h5py.File(self.h5_path, mode='a')

    # First create a measurement group
    h5_meas_group = create_indexed_group(h5_file, 'Measurement')

    # Set up some parameters that will be written as attributes to this
    # Measurement group
    global_parms = dict()
    global_parms['data_type'] = 'trKPFM'
    global_parms['translator'] = 'trKPFM'
    write_simple_attrs(h5_meas_group, global_parms)
    write_simple_attrs(h5_meas_group, parm_dict)

    # Now start building the position and spectroscopic dimension containers.
    # There's only one spectroscopic dimension and two position dimensions.

    # The excit_wfm only has the DC values without any information on cycles,
    # time, etc. What we really need is to add the time component: for every
    # DC step there are some time steps.
    # Need to divide by 2 because it considers on and off field
    num_time_steps = (spectrogram_size - 5) // excit_wfm.size // 2

    # There should be three spectroscopic axes.
    # In order of fastest to slowest varying, we have: time, voltage, field

    time_vec = np.linspace(0, parm_dict['IO_time'], num_time_steps)
    print('Num time steps: {}'.format(num_time_steps))
    print('DC Vec size: {}'.format(excit_wfm.shape))
    print('Spectrogram size: {}'.format(spectrogram_size))

    field_vec = np.array([0, 1])

    spec_dims = [Dimension('Time', 's', time_vec),
                 Dimension('Field', 'Binary', field_vec),
                 Dimension('Bias', 'V', excit_wfm)]
    pos_dims = [Dimension('Cols', 'm', int(parm_dict['grid_num_cols'])),
                Dimension('Rows', 'm', int(parm_dict['grid_num_rows']))]

    self.raw_datasets = list()

    for chan_index in range(num_ai_chans):
        chan_grp = create_indexed_group(h5_meas_group, 'Channel')

        if chan_index == 0:
            write_simple_attrs(chan_grp, {'Harmonic': 1})
        else:
            write_simple_attrs(chan_grp, {'Harmonic': 2})

        h5_raw = write_main_dataset(
            chan_grp,  # parent HDF5 group
            (num_pixels, spectrogram_size - 5),  # shape of Main dataset
            'Raw_Data',  # Name of main dataset
            'Deflection',  # Physical quantity contained in Main dataset
            'V',  # Units for the physical quantity
            pos_dims,  # Position dimensions
            spec_dims,  # Spectroscopic dimensions
            dtype=np.complex64,  # data type / precision
            compression='gzip',
            chunks=(1, spectrogram_size - 5),
            main_dset_attrs={'quantity': 'Complex'})
        # h5_refs = hdf.write(chan_grp, print_log=False)
        # h5_raw = get_h5_obj_refs(['Raw_Data'], h5_refs)[0]
        # link_h5_objects_as_attrs(h5_raw, get_h5_obj_refs(aux_ds_names,
        #                                                  h5_refs))
        self.raw_datasets.append(h5_raw)

    # Now that the N channels have been made, populate them with the actual
    # data....
    self._read_data(parm_dict, parm_path, spectrogram_size)

    h5_file.close()  # hdf.close()
    return self.h5_path
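# Worked example of the time-step arithmetic above (hedged; numbers are
# illustrative only): for spectrogram_size = 4101 and excit_wfm.size = 16,
# num_time_steps = (4101 - 5) // 16 // 2 = 4096 // 16 // 2 = 128,
# i.e. 128 time samples per DC step per field state (on / off). The
# spectroscopic axes then multiply back to 128 * 2 * 16 = 4096, which equals
# spectrogram_size - 5, the per-pixel width of Raw_Data.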
def _setup_h5(self, data_gen_parms):
    """
    Sets up the hdf5 file structure before doing the actual generation

    Parameters
    ----------
    data_gen_parms : dict
        Dictionary containing the parameters to write to the Measurement
        Group as attributes
    """
    '''
    Build the group structure down to the channel group
    '''
    # Set up the basic group structure
    root_parms = dict()
    root_parms['translator'] = 'FAKEBEPS'
    root_parms['data_type'] = data_gen_parms['data_type']

    # Write the file
    self.h5_f = h5py.File(self.h5_path, 'w')
    write_simple_attrs(self.h5_f, root_parms)

    meas_grp = create_indexed_group(self.h5_f, 'Measurement')
    chan_grp = create_indexed_group(meas_grp, 'Channel')

    write_simple_attrs(meas_grp, data_gen_parms)

    # Create the Position and Spectroscopic datasets for the Raw Data
    h5_pos_dims, h5_spec_dims = self._build_ancillary_datasets()

    h5_raw_data = write_main_dataset(chan_grp,
                                     (self.n_pixels, self.n_spec_bins),
                                     'Raw_Data', 'Deflection', 'Volts',
                                     h5_pos_dims, h5_spec_dims,
                                     slow_to_fast=True,
                                     dtype=np.complex64, verbose=True)

    '''
    Build the SHO Group
    '''
    sho_grp = create_results_group(h5_raw_data, 'SHO_Fit')

    # Build the Spectroscopic datasets for the SHO Guess and Fit
    h5_sho_spec_inds, h5_sho_spec_vals = write_reduced_anc_dsets(
        sho_grp, h5_raw_data.h5_spec_inds, h5_raw_data.h5_spec_vals,
        'Frequency', is_spec=True)

    h5_sho_fit = write_main_dataset(
        sho_grp, (self.n_pixels, int(self.n_spec_bins // self.n_bins)),
        'Fit', 'SHO Parameters', 'a.u.', None, None,
        h5_pos_inds=h5_raw_data.h5_pos_inds,
        h5_pos_vals=h5_raw_data.h5_pos_vals,
        h5_spec_inds=h5_sho_spec_inds,
        h5_spec_vals=h5_sho_spec_vals,
        slow_to_fast=True, dtype=sho32)

    h5_sho_guess = copy_dataset(h5_sho_fit, sho_grp, alias='Guess')

    '''
    Build the loop group
    '''
    loop_grp = create_results_group(h5_sho_fit, 'Loop_Fit')

    # Build the Spectroscopic datasets for the loops
    h5_loop_spec_inds, h5_loop_spec_vals = write_reduced_anc_dsets(
        loop_grp, h5_sho_fit.h5_spec_inds, h5_sho_fit.h5_spec_vals,
        'DC_Offset', is_spec=True)

    h5_loop_fit = write_main_dataset(
        loop_grp, (self.n_pixels, self.n_loops), 'Fit',
        'Loop Fitting Parameters', 'a.u.', None, None,
        h5_pos_inds=h5_raw_data.h5_pos_inds,
        h5_pos_vals=h5_raw_data.h5_pos_vals,
        h5_spec_inds=h5_loop_spec_inds,
        h5_spec_vals=h5_loop_spec_vals,
        slow_to_fast=True, dtype=loop_fit32)

    h5_loop_guess = copy_dataset(h5_loop_fit, loop_grp, alias='Guess')
    copy_all_region_refs(h5_loop_guess, h5_loop_fit)

    self.h5_raw = h5_raw_data
    self.h5_sho_guess = h5_sho_guess
    self.h5_sho_fit = h5_sho_fit
    self.h5_loop_guess = h5_loop_guess
    self.h5_loop_fit = h5_loop_fit
    self.h5_spec_vals = h5_raw_data.h5_spec_vals
    self.h5_spec_inds = h5_raw_data.h5_spec_inds
    self.h5_sho_spec_inds = h5_sho_fit.h5_spec_inds
    self.h5_sho_spec_vals = h5_sho_fit.h5_spec_vals
    self.h5_loop_spec_inds = h5_loop_fit.h5_spec_inds
    self.h5_loop_spec_vals = h5_loop_fit.h5_spec_vals
    self.h5_file = h5_raw_data.file

    return
def translate(self, file_path, show_plots=True, save_plots=True,
              do_histogram=False):
    """
    Basic method that translates .dat data file(s) to a single .h5 file

    Inputs:
        file_path -- Absolute file path for one of the data files.
        It is assumed that this file is of the OLD data format.

    Outputs:
        Nothing
    """
    file_path = path.abspath(file_path)
    (folder_path, basename) = path.split(file_path)
    (basename, path_dict) = self._parse_file_path(file_path)

    h5_path = path.join(folder_path, basename + '.h5')
    if path.exists(h5_path):
        remove(h5_path)
    self.h5_file = h5py.File(h5_path, 'w')

    isBEPS = True
    parm_dict = self.__getParmsFromOldMat(path_dict['old_mat_parms'])

    # Here we assume that there is no in-field data. If in-field data is
    # captured then the translator would have to be modified.
    ignored_plt_grps = ['in-field']

    # Technically, we could do away with this if statement, as isBEPS is
    # always true for this translation
    if isBEPS:
        parm_dict['data_type'] = 'BEPSData'

        std_expt = parm_dict['VS_mode'] != \
            'load user defined VS Wave from file'

        if not std_expt:
            warn('This translator does not handle user defined voltage '
                 'spectroscopy')
            return

        spec_label = getSpectroscopicParmLabel(parm_dict['VS_mode'])

        # Check file sizes:
        if 'read_real' in path_dict.keys():
            real_size = path.getsize(path_dict['read_real'])
            imag_size = path.getsize(path_dict['read_imag'])
        else:
            real_size = path.getsize(path_dict['write_real'])
            imag_size = path.getsize(path_dict['write_imag'])

        if real_size != imag_size:
            raise ValueError("Real and imaginary file sizes DON'T match! "
                             "Ending")

        num_rows = int(parm_dict['grid_num_rows'])
        num_cols = int(parm_dict['grid_num_cols'])
        num_pix = num_rows * num_cols
        # Finding bins by simple division of the entire data size
        tot_bins = real_size / (num_pix * 4)

        # Check for the case where only a single pixel is missing.
        check_bins = real_size / ((num_pix - 1) * 4)

        if tot_bins % 1 and check_bins % 1:
            warn('Aborting! Some parameter appears to have changed '
                 'in-between')
            return
        elif not tot_bins % 1:
            # Everything's ok
            pass
        elif not check_bins % 1:
            tot_bins = check_bins
            warn('Warning: A pixel seems to be missing from the data. '
                 'File will be padded with zeros.')

        tot_bins = int(tot_bins)
        (bin_inds, bin_freqs, bin_FFT, ex_wfm, dc_amp_vec) = \
            self.__readOldMatBEvecs(path_dict['old_mat_parms'])
        """
        Because this is the old data format and there is a discrepancy in
        the number of bins (they seem to be 2 less than the actual number),
        we need to re-calculate it based on the available data. This is done
        below.
        """
        band_width = parm_dict['BE_band_width_[Hz]'] * \
            (0.5 - parm_dict['BE_band_edge_trim'])
        st_f = parm_dict['BE_center_frequency_[Hz]'] - band_width
        en_f = parm_dict['BE_center_frequency_[Hz]'] + band_width
        bin_freqs = np.linspace(st_f, en_f, len(bin_inds), dtype=np.float32)

        # Forcing standardized datatypes:
        bin_inds = np.int32(bin_inds)
        bin_freqs = np.float32(bin_freqs)
        bin_FFT = np.complex64(bin_FFT)
        ex_wfm = np.float32(ex_wfm)

        self.FFT_BE_wave = bin_FFT

        (UDVS_labs, UDVS_units, UDVS_mat) = self.__buildUDVSTable(parm_dict)

        # Remove the unused plot group columns before proceeding:
        (UDVS_mat, UDVS_labs, UDVS_units) = trimUDVS(UDVS_mat, UDVS_labs,
                                                     UDVS_units,
                                                     ignored_plt_grps)

        spec_inds = np.zeros(shape=(2, tot_bins), dtype=INDICES_DTYPE)

        # Will assume that all excitation waveforms have the same number of
        # bins. Here, the denominator is 2 because there are only
        # out-of-field measurements. For IF + OF, it should be 1.
        num_actual_udvs_steps = UDVS_mat.shape[0] / 2
        bins_per_step = tot_bins / num_actual_udvs_steps

        # Some more checks
        if bins_per_step % 1:
            warn('Non integer number of bins per step!')
            return
        else:
            bins_per_step = int(bins_per_step)
            num_actual_udvs_steps = int(num_actual_udvs_steps)

        stind = 0
        for step_index in range(UDVS_mat.shape[0]):
            if UDVS_mat[step_index, 2] < 1E-3:  # invalid AC amplitude
                continue  # skip
            # Bin step
            spec_inds[0, stind:stind + bins_per_step] = \
                np.arange(bins_per_step, dtype=INDICES_DTYPE)
            # UDVS step
            spec_inds[1, stind:stind + bins_per_step] = \
                step_index * np.ones(bins_per_step, dtype=INDICES_DTYPE)
            stind += bins_per_step
        del stind, step_index

        # Some very basic information that can help the processing /
        # analysis crew
        parm_dict['num_bins'] = tot_bins
        parm_dict['num_pix'] = num_pix
        parm_dict['num_udvs_steps'] = num_actual_udvs_steps

        global_parms = dict()
        global_parms['grid_size_x'] = parm_dict['grid_num_cols']
        global_parms['grid_size_y'] = parm_dict['grid_num_rows']
        global_parms['experiment_date'] = parm_dict['File_date_and_time']

        # assuming that the experiment was completed:
        global_parms['current_position_x'] = parm_dict['grid_num_cols'] - 1
        global_parms['current_position_y'] = parm_dict['grid_num_rows'] - 1
        global_parms['data_type'] = parm_dict['data_type']  # self.__class__.__name__
        global_parms['translator'] = 'ODF'
        write_simple_attrs(self.h5_file, global_parms)

        # Create the Measurement and Channel groups
        meas_grp = create_indexed_group(self.h5_file, 'Measurement')
        write_simple_attrs(meas_grp, parm_dict)

        chan_grp = create_indexed_group(meas_grp, 'Channel')
        chan_grp.attrs['Channel_Input'] = parm_dict['IO_Analog_Input_1']

        # Create the Auxiliary Datasets
        h5_ex_wfm = chan_grp.create_dataset('Excitation_Waveform',
                                            data=ex_wfm)

        udvs_slices = dict()
        for col_ind, col_name in enumerate(UDVS_labs):
            udvs_slices[col_name] = (slice(None),
                                     slice(col_ind, col_ind + 1))
        h5_UDVS = chan_grp.create_dataset('UDVS', data=UDVS_mat,
                                          dtype=np.float32)
        write_simple_attrs(h5_UDVS, {'labels': UDVS_labs,
                                     'units': UDVS_units})

        h5_bin_steps = chan_grp.create_dataset(
            'Bin_Steps', data=np.arange(bins_per_step, dtype=np.uint32),
            dtype=np.uint32)

        # Need to add the Bin Waveform type - infer from UDVS
        exec_bin_vec = self.signal_type * np.ones(len(bin_inds),
                                                  dtype=np.int32)
        h5_wfm_typ = chan_grp.create_dataset('Bin_Wfm_Type',
                                             data=exec_bin_vec,
                                             dtype=np.int32)

        h5_bin_inds = chan_grp.create_dataset('Bin_Indices', data=bin_inds,
                                              dtype=np.uint32)
        h5_bin_freq = chan_grp.create_dataset('Bin_Frequencies',
                                              data=bin_freqs,
                                              dtype=np.float32)
        h5_bin_FFT = chan_grp.create_dataset('Bin_FFT', data=bin_FFT,
                                             dtype=np.complex64)

        # Noise floor should be of shape: (udvs_steps x 3 x positions)
        h5_noise_floor = chan_grp.create_dataset(
            'Noise_Floor', shape=(num_pix, num_actual_udvs_steps),
            dtype=nf32, chunks=(1, num_actual_udvs_steps))

        """
        ONLY ALLOCATING SPACE FOR MAIN DATA HERE!
        Chunk by each UDVS step - this makes it easy / quick to:
            1. read data for a single UDVS step from all pixels
            2. read an entire pixel / multiple pixels at a time
        The only problem is that a typical UDVS step containing 50 steps
        occupies only 400 bytes. This is smaller than the recommended chunk
        sizes of 10,000 - 999,999 bytes, meaning that the metadata would be
        very substantial. This assumption is fine since we almost never
        handle any user-defined cases.
        """

        """
        New method for chunking the Main_Data dataset.
        Chunking is now done in N-by-N squares of UDVS steps by pixels.
        N is determined dynamically based on the dimensions of the dataset.
        Currently it is set such that individual chunks are less than 10 kB
        in size.

        Chris Smith -- [email protected]
        """
        pos_dims = [Dimension('X', 'nm', num_cols),
                    Dimension('Y', 'nm', num_rows)]

        # Create the Spectroscopic Values and Spectroscopic Values Labels
        # datasets
        spec_vals, spec_inds, spec_vals_labs, spec_vals_units, \
            spec_vals_names = createSpecVals(UDVS_mat, spec_inds, bin_freqs,
                                             exec_bin_vec, parm_dict,
                                             UDVS_labs, UDVS_units)

        spec_dims = list()
        for row_ind, row_name in enumerate(spec_vals_labs):
            spec_dims.append(Dimension(row_name,
                                       spec_vals_units[row_ind],
                                       spec_vals[row_ind]))

        pixel_chunking = maxReadPixels(10240,
                                       num_pix * num_actual_udvs_steps,
                                       bins_per_step,
                                       np.dtype('complex64').itemsize)
        chunking = np.floor(np.sqrt(pixel_chunking))
        chunking = max(1, chunking)
        chunking = int(min(num_actual_udvs_steps, num_pix, chunking))
        self.h5_main = write_main_dataset(
            chan_grp, (num_pix, tot_bins), 'Raw_Data', 'Piezoresponse', 'V',
            pos_dims, spec_dims, dtype=np.complex64,
            chunks=(chunking, chunking * bins_per_step), compression='gzip')

        self.mean_resp = np.zeros(shape=(self.h5_main.shape[1]),
                                  dtype=np.complex64)
        self.max_resp = np.zeros(shape=(self.h5_main.shape[0]),
                                 dtype=np.float32)
        self.min_resp = np.zeros(shape=(self.h5_main.shape[0]),
                                 dtype=np.float32)

        # Now read the raw data files:
        self._read_data(path_dict['read_real'], path_dict['read_imag'],
                        parm_dict)
        self.h5_file.flush()

        generatePlotGroups(self.h5_main, self.mean_resp, folder_path,
                           basename, self.max_resp, self.min_resp,
                           max_mem_mb=self.max_ram, spec_label=spec_label,
                           show_plots=show_plots, save_plots=save_plots,
                           do_histogram=do_histogram)

        self.h5_file.close()

        return h5_path
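# Worked sketch of the chunk-side clamp above (hedged; the pixel_chunking
# value is illustrative since maxReadPixels' internals are not shown here):
# if maxReadPixels(...) returned 170, then floor(sqrt(170)) = 13, so
# chunking = min(num_actual_udvs_steps, num_pix, 13), and the Raw_Data
# chunks become (13, 13 * bins_per_step) whenever both the step and pixel
# counts exceed 13.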
def translate(self, file_path, *args, **kwargs):
    """
    Translates a given Bruker / Veeco / Nanoscope AFM derived file to HDF5.
    Currently handles scans, force curves, and force-distance maps.

    Note that this translator was written with a single example file for
    each modality and may be buggy.

    Parameters
    ----------
    file_path : str / unicode
        path to data file

    Returns
    -------
    h5_path : str / unicode
        path to translated HDF5 file
    """
    self.file_path = path.abspath(file_path)
    self.meta_data, other_parms = self._extract_metadata()

    # These files are weirdly named with extensions such as .001
    h5_path = file_path.replace('.', '_') + '.h5'

    if path.exists(h5_path):
        remove(h5_path)

    h5_file = h5py.File(h5_path, 'w')

    type_suffixes = ['Image', 'Force_Curve', 'Force_Map']
    # 0 - stack of scan images
    # 1 - single force curve
    # 2 - force map
    force_count = 0
    image_count = 0
    for class_name in self.meta_data.keys():
        if 'Ciao force image list' in class_name:
            force_count += 1
        elif 'Ciao image list' in class_name:
            image_count += 1
    data_type = 0
    if force_count > 0:
        if image_count > 0:
            data_type = 2
        else:
            data_type = 1

    global_parms = dict()
    global_parms['data_type'] = 'Bruker_AFM_' + type_suffixes[data_type]
    global_parms['translator'] = 'Bruker_AFM'
    write_simple_attrs(h5_file, global_parms)

    # too many parameters. Making a dummy group just for the parameters.
    h5_parms_grp = h5_file.create_group('Parameters')
    # We currently have a dictionary of dictionaries. This needs to be
    # flattened
    flat_dict = dict()
    for class_name, sub_dict in other_parms.items():
        for key, val in sub_dict.items():
            flat_dict[class_name + '_' + key] = val
    write_simple_attrs(h5_parms_grp, flat_dict)

    # Create measurement group
    h5_meas_grp = create_indexed_group(h5_file, 'Measurement')

    # Call the data-specific translation function
    trans_funcs = [self._translate_image_stack, self._translate_force_curve,
                   self._translate_force_map]
    trans_funcs[data_type](h5_meas_grp)

    # wrap up and return path
    h5_file.close()
    return h5_path
def translate(self, file_path):
    """
    The main function that translates the provided file into a .h5 file

    Parameters
    ----------
    file_path : String / unicode
        Absolute path of any file in the directory

    Returns
    -------
    h5_path : String / unicode
        Absolute path of the h5 file
    """
    file_path = path.abspath(file_path)
    # Figure out the basename of the data:
    (basename, parm_paths, data_paths) = \
        super(GTuneTranslator, self)._parse_file_path(file_path)

    (folder_path, unused) = path.split(file_path)
    h5_path = path.join(folder_path, basename + '.h5')

    if path.exists(h5_path):
        remove(h5_path)

    # Load parameters from the .mat file
    matread = loadmat(parm_paths['parm_mat'],
                      variable_names=['AI_wave', 'BE_wave_AO_0',
                                     'BE_wave_AO_1', 'BE_wave_train',
                                     'BE_wave', 'total_cols', 'total_rows'])
    be_wave = np.float32(np.squeeze(matread['BE_wave']))
    be_wave_train = np.float32(np.squeeze(matread['BE_wave_train']))

    num_cols = int(matread['total_cols'][0][0])
    expected_rows = int(matread['total_rows'][0][0])
    self.points_per_pixel = len(be_wave)
    self.points_per_line = len(be_wave_train)

    # Load parameters from the .txt file - 'BE_center_frequency_[Hz]',
    # 'IO rate'
    is_beps, parm_dict = parmsToDict(parm_paths['parm_txt'])

    # Get the file byte size:
    # For now, assume that bigtime_00 always exists and is the main file
    file_size = path.getsize(data_paths[0])

    # Calculate the actual number of lines since the first few lines may not
    # be saved
    self.num_rows = 1.0 * file_size / (4 * self.points_per_pixel * num_cols)
    if self.num_rows % 1:
        warn('Error - File has incomplete rows')
        return None
    else:
        self.num_rows = int(self.num_rows)

    samp_rate = parm_dict['IO_rate_[Hz]']
    ex_freq_nominal = parm_dict['BE_center_frequency_[Hz]']

    # method 1 for calculating the correct excitation frequency:
    pixel_duration = 1.0 * self.points_per_pixel / samp_rate
    num_periods = pixel_duration * ex_freq_nominal
    ex_freq_correct = 1 / (pixel_duration / np.floor(num_periods))

    # Correcting the excitation frequency - will be VERY useful during
    # analysis and filtering
    parm_dict['BE_center_frequency_[Hz]'] = ex_freq_correct

    # Some very basic information that can help the processing crew
    parm_dict['points_per_line'] = self.points_per_line
    parm_dict['num_bins'] = self.points_per_pixel
    parm_dict['grid_num_rows'] = self.num_rows
    parm_dict['data_type'] = 'G_mode_line'

    if self.num_rows != expected_rows:
        print('Note: {} of {} lines found in data file'.format(
            self.num_rows, expected_rows))

    # Calculate the number of points to read per line:
    self.__bytes_per_row__ = int(file_size / self.num_rows)

    # First finish writing all global parameters, and create the file too:
    h5_file = h5py.File(h5_path, 'w')
    global_parms = dict()

    global_parms['data_type'] = 'G_mode_line'
    global_parms['translator'] = 'G_mode_line'
    write_simple_attrs(h5_file, global_parms)

    # Next, create the Measurement and Channel groups and write the
    # appropriate parameters to them
    meas_grp = create_indexed_group(h5_file, 'Measurement')
    write_simple_attrs(meas_grp, parm_dict)

    # Now that the file has been created, go over each raw data file:
    """
    We only allocate the space for the main data here.
    This does NOT change with each file. The data written to it does.
    The auxiliary datasets will not change with each raw data file since
    only one excitation waveform is used.
    """
    pos_desc = Dimension('Y', 'm', np.arange(self.num_rows))
    spec_desc = Dimension('Excitation', 'V',
                          np.tile(VALUES_DTYPE(be_wave), num_cols))

    h5_pos_ind, h5_pos_val = write_ind_val_dsets(meas_grp, pos_desc,
                                                 is_spectral=False)
    h5_spec_inds, h5_spec_vals = write_ind_val_dsets(meas_grp, spec_desc,
                                                     is_spectral=True)

    for f_index in data_paths.keys():
        chan_grp = create_indexed_group(meas_grp, 'Channel')

        h5_main = write_main_dataset(
            chan_grp, (self.num_rows, self.points_per_pixel * num_cols),
            'Raw_Data', 'Deflection', 'V', None, None,
            h5_pos_inds=h5_pos_ind, h5_pos_vals=h5_pos_val,
            h5_spec_inds=h5_spec_inds, h5_spec_vals=h5_spec_vals,
            chunks=(1, self.points_per_pixel), dtype=np.float16)

        # Now transfer the scan data in the dat file to the h5 file:
        super(GTuneTranslator, self)._read_data(data_paths[f_index], h5_main)

    h5_file.close()
    print('G-Tune translation complete!')

    return h5_path
def translate(self, h5_path, force_patch=False, **kwargs):
    """
    Add the needed references and attributes to the h5 file that are not
    created by the LabView data acquisition program.

    Parameters
    ----------
    h5_path : str
        path to the h5 file
    force_patch : bool, optional
        Should the check to see if the file has already been patched be
        ignored. Default False.

    Returns
    -------
    h5_file : str
        path to the patched dataset
    """
    # TODO: Need a way to choose which channels to apply the patcher to.
    # It fails for multi-channel files where not all channels are capable of
    # being main datasets.

    # Open the file and check if a patch is needed
    h5_file = h5py.File(os.path.abspath(h5_path), 'r+')
    if h5_file.attrs.get('translator') is not None and not force_patch:
        print('File is already Pycroscopy ready.')
        h5_file.close()
        return h5_path

    '''
    Get the list of all Raw_Data Datasets.
    Loop over the list and update the needed attributes.
    '''
    raw_list = find_dataset(h5_file, 'Raw_Data')
    for h5_raw in raw_list:
        if 'quantity' not in h5_raw.attrs:
            h5_raw.attrs['quantity'] = 'quantity'
        if 'units' not in h5_raw.attrs:
            h5_raw.attrs['units'] = 'a.u.'

        # Grab the channel and measurement group of the data to check some
        # needed attributes
        h5_chan = h5_raw.parent
        try:
            c_type = get_attr(h5_chan, 'channel_type')
        except KeyError:
            warn_str = "'channel_type' was not found as an attribute of " \
                       "{}.\n".format(h5_chan.name)
            warn_str += "If this is BEPS or BELine data from the LabView " \
                        "acquisition software, please run the following " \
                        "piece of code. Afterwards, run this function " \
                        "again.\n" \
                        "CODE: hdf.file['{}'].attrs['channel_type'] = " \
                        "'BE'".format(h5_chan.name)
            warn(warn_str)
            h5_file.close()
            return h5_path
        except:
            raise

        if c_type != 'BE':
            continue

        h5_meas = h5_chan.parent
        h5_meas.attrs['num_UDVS_steps'] = h5_meas.attrs['num_steps']

        # Get the object handles for the Indices and Values datasets
        h5_pos_inds = h5_chan['Position_Indices']
        h5_pos_vals = h5_chan['Position_Values']
        h5_spec_inds = h5_chan['Spectroscopic_Indices']
        h5_spec_vals = h5_chan['Spectroscopic_Values']

        # Make sure we have the correct spectroscopic indices for the given
        # values
        ds_spec_inds = create_spec_inds_from_vals(h5_spec_vals[()])
        if not np.allclose(ds_spec_inds, h5_spec_inds[()]):
            h5_spec_inds[:, :] = ds_spec_inds[:, :]
            h5_file.flush()

        # Get the labels and units for the Spectroscopic datasets
        h5_spec_labels = h5_spec_inds.attrs['labels']
        inds_and_vals = [h5_pos_inds, h5_pos_vals, h5_spec_inds,
                         h5_spec_vals]
        for dset in inds_and_vals:
            spec_labels = dset.attrs['labels']
            try:
                spec_units = dset.attrs['units']
                if len(spec_units) != len(spec_labels):
                    raise KeyError
            except KeyError:
                # 'units' was missing or malformed - write one empty unit
                # per label. (The original assigned to the dataset itself
                # instead of its attrs, which was a bug.)
                dset.attrs['units'] = ['' for _ in spec_labels]
            except:
                raise

        """
        In early versions, too many spectroscopic dimension labels and units
        were listed compared to the number of rows. Remove them here:
        """
        remove_non_exist_spec_dim_labs(h5_spec_inds, h5_spec_vals, h5_meas,
                                       verbose=False)

        """
        Add back some standard metadata to be consistent with older BE data
        """
        missing_metadata = dict()
        if 'File_file_name' not in h5_meas.attrs.keys():
            missing_metadata['File_file_name'] = \
                os.path.split(h5_raw.file.filename)[-1].replace('.h5', '')
        if 'File_date_and_time' not in h5_meas.attrs.keys():
            try:
                date_str = get_attr(h5_raw.file, 'date_string')
                time_str = get_attr(h5_raw.file, 'time_string')
                full_str = date_str.strip() + ' ' + time_str.strip()
                """
                convert:
                    date_string : 2018-12-05
                    time_string : 3:41:45 PM
                to:
                    File_date_and_time: 19-Jun-2009 18:44:56
                """
                try:
                    dt_obj = datetime.datetime.strptime(
                        full_str, "%Y-%m-%d %I:%M:%S %p")
                    missing_metadata['File_date_and_time'] = \
                        dt_obj.strftime('%d-%b-%Y %H:%M:%S')
                except ValueError:
                    pass
            except KeyError:
                pass
        # Now write to the measurement group:
        if len(missing_metadata) > 0:
            write_simple_attrs(h5_meas, missing_metadata)

        # Link the references to the Indices and Values datasets to the
        # Raw_Data
        print(h5_raw.shape, h5_pos_vals.shape, h5_spec_vals.shape)
        print(h5_spec_inds.shape, h5_pos_inds.shape)
        link_as_main(h5_raw, h5_pos_inds, h5_pos_vals, h5_spec_inds,
                     h5_spec_vals)

        # Also link the Bin_Frequencies and Bin_Wfm_Type datasets
        h5_freqs = h5_chan['Bin_Frequencies']
        aux_dset_names = ['Bin_Frequencies']
        aux_dset_refs = [h5_freqs.ref]
        check_and_link_ancillary(h5_raw, aux_dset_names,
                                 anc_refs=aux_dset_refs)

        '''
        Get all SHO_Fit groups for the Raw_Data and loop over them.
        Get the Guess and Spectroscopic Datasets for each SHO_Fit group.
        '''
        sho_list = find_results_groups(h5_raw, 'SHO_Fit')
        for h5_sho in sho_list:
            h5_sho_guess = h5_sho['Guess']
            h5_sho_spec_inds = h5_sho['Spectroscopic_Indices']
            h5_sho_spec_vals = h5_sho['Spectroscopic_Values']

            # Make sure we have the correct spectroscopic indices for the
            # given values
            ds_sho_spec_inds = create_spec_inds_from_vals(
                h5_sho_spec_inds[()])
            if not np.allclose(ds_sho_spec_inds, h5_sho_spec_inds[()]):
                h5_sho_spec_inds[:, :] = ds_sho_spec_inds[:, :]

            # Get the labels and units for the Spectroscopic datasets
            h5_sho_spec_labels = get_attr(h5_sho_spec_inds, 'labels')
            h5_sho_spec_units = get_attr(h5_sho_spec_vals, 'units')

            if h5_sho_spec_inds.shape[-1] != h5_sho_guess.shape[-1]:
                print('Warning! Found incorrect spectral dimension for '
                      'dataset {}. Attempting a fix.'.format(h5_sho_guess))
                try:
                    h5_sho_spec_inds = h5_sho_guess.parent.create_dataset(
                        "h5_sho_spec_inds_fixed", shape=(1, 1),
                        dtype='uint32')
                    h5_sho_spec_inds.attrs['labels'] = 'labels'
                    h5_sho_spec_inds.attrs['units'] = 'units'
                except RuntimeError:
                    print("It seems that the file has already been patched."
                          " Will use previously computed ancillary datasets")
                    h5_sho_spec_inds = \
                        h5_sho_guess.parent['h5_sho_spec_inds_fixed']
                try:
                    h5_sho_spec_vals = h5_sho_guess.parent.create_dataset(
                        "h5_sho_spec_vals_fixed", shape=(1, 1),
                        dtype='uint32')
                    h5_sho_spec_vals[:] = 0
                    h5_sho_spec_vals.attrs['labels'] = 'labels'
                    h5_sho_spec_vals.attrs['units'] = 'units'
                except RuntimeError:
                    print("It seems that the file has already been patched."
                          " Will use previously computed ancillary datasets")
                    # Retrieve the dataset created above (the original looked
                    # up a non-existent 'h5_sho_spec_vals_fixed2')
                    h5_sho_spec_vals = \
                        h5_sho_guess.parent['h5_sho_spec_vals_fixed']

            link_as_main(h5_sho_guess, h5_pos_inds, h5_pos_vals,
                         h5_sho_spec_inds, h5_sho_spec_vals)
            sho_inds_and_vals = [h5_sho_spec_inds, h5_sho_spec_vals]

            for dset in sho_inds_and_vals:
                spec_labels = get_attr(dset, 'labels')
                try:
                    spec_units = get_attr(dset, 'units')
                    if len(spec_units) != len(spec_labels):
                        raise KeyError
                except KeyError:
                    spec_units = [''.encode('utf-8') for _ in spec_labels]
                    dset.attrs['units'] = spec_units
                except:
                    raise

        h5_file.flush()

    h5_file.attrs['translator'] = 'V3patcher'.encode('utf-8')

    h5_file.close()

    return h5_path
def translate(self, parm_path):
    """
    The main function that translates the provided file into a .h5 file

    Parameters
    ----------
    parm_path : string / unicode
        Absolute file path of the parameters .mat file.

    Returns
    -------
    h5_path : string / unicode
        Absolute path of the translated h5 file
    """
    parm_path = path.abspath(parm_path)
    parm_dict, excit_wfm = self._read_parms(parm_path)
    folder_path, base_name = path.split(parm_path)
    waste, base_name = path.split(folder_path)

    # Until a better method is provided....
    with h5py.File(path.join(folder_path, 'line_1.mat'),
                   'r') as h5_mat_line_1:
        num_ai_chans = h5_mat_line_1['data'].shape[1]

    h5_path = path.join(folder_path, base_name + '.h5')
    if path.exists(h5_path):
        remove(h5_path)

    with h5py.File(h5_path, mode='w') as h5_f:
        h5_meas_grp = create_indexed_group(h5_f, 'Measurement')
        global_parms = dict()
        global_parms.update({'data_type': 'gIV', 'translator': 'gIV'})
        write_simple_attrs(h5_meas_grp, global_parms)

        # Only prepare the instructions for the dimensions here
        spec_dims = Dimension('Bias', 'V', excit_wfm)
        pos_dims = Dimension('Y', 'm',
                             np.linspace(0,
                                         parm_dict['grid_scan_height_[m]'],
                                         parm_dict['grid_num_rows']))

        self.raw_datasets = list()

        for chan_index in range(num_ai_chans):
            h5_chan_grp = create_indexed_group(h5_meas_grp, 'Channel')
            write_simple_attrs(h5_chan_grp, parm_dict)
            """
            Minimize file size to the extent possible.
            DAQs are rated at 16 bit, so float16 should be the most
            appropriate. For some reason, compression is effective only on
            time series data.
            """
            h5_raw = write_main_dataset(
                h5_chan_grp, (parm_dict['grid_num_rows'], excit_wfm.size),
                'Raw_Data', 'Current',
                '1E-{} A'.format(parm_dict['IO_amplifier_gain']),
                pos_dims, spec_dims, dtype=np.float16,
                chunks=(1, excit_wfm.size), compression='gzip')

            self.raw_datasets.append(h5_raw)

        # Now that the N channels have been made, populate them with the
        # actual data....
        self._read_data(parm_dict, folder_path)

    return h5_path
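# Worked example of the unit string above (hedged; the gain value is
# illustrative only): for parm_dict['IO_amplifier_gain'] = 9, the units
# become '1E-9 A', i.e. the raw float16 counts are interpreted in
# nanoamperes.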
def _create_results_datasets(self):
    """
    Creates all the datasets necessary for holding all parameters + data.
    """
    self.h5_results_grp = create_results_group(
        self.h5_main, self.process_name,
        h5_parent_group=self._h5_target_group)

    self.parms_dict.update({'last_pixel': 0,
                            'algorithm': 'pycroscopy_SignalFilter'})

    write_simple_attrs(self.h5_results_grp, self.parms_dict)

    assert isinstance(self.h5_results_grp, h5py.Group)

    if isinstance(self.composite_filter, np.ndarray):
        h5_comp_filt = self.h5_results_grp.create_dataset(
            'Composite_Filter', data=np.float32(self.composite_filter))

        if self.verbose and self.mpi_rank == 0:
            print('Rank {} - Finished creating the Composite_Filter '
                  'dataset'.format(self.mpi_rank))

    # First create the position datasets if the new indices are smaller...
    if self.num_effective_pix != self.h5_main.shape[0]:
        # TODO: Do this part correctly. See the past solution:
        """
        # need to make new position datasets by taking every n'th index /
        # value:
        new_pos_vals = np.atleast_2d(
            h5_pos_vals[slice(0, None, self.num_effective_pix), :])
        pos_descriptor = []
        for name, units, leng in zip(
                h5_pos_inds.attrs['labels'], h5_pos_inds.attrs['units'],
                [int(np.unique(h5_pos_inds[:, dim_ind]).size /
                     self.num_effective_pix)
                 for dim_ind in range(h5_pos_inds.shape[1])]):
            pos_descriptor.append(Dimension(name, units, np.arange(leng)))
        ds_pos_inds, ds_pos_vals = build_ind_val_dsets(
            pos_descriptor, is_spectral=False, verbose=self.verbose)
        h5_pos_vals.data = np.atleast_2d(new_pos_vals)
        # The data generated above varies linearly. Override.
        """
        h5_pos_inds_new, h5_pos_vals_new = write_ind_val_dsets(
            self.h5_results_grp,
            Dimension('pixel', 'a.u.', self.num_effective_pix),
            is_spectral=False,
            verbose=self.verbose and self.mpi_rank == 0)
        if self.verbose and self.mpi_rank == 0:
            print('Rank {} - Created the new position ancillary '
                  'dataset'.format(self.mpi_rank))
    else:
        h5_pos_inds_new = self.h5_main.h5_pos_inds
        h5_pos_vals_new = self.h5_main.h5_pos_vals

        if self.verbose and self.mpi_rank == 0:
            print('Rank {} - Reusing the source position '
                  'datasets'.format(self.mpi_rank))

    if self.noise_threshold is not None:
        self.h5_noise_floors = write_main_dataset(
            self.h5_results_grp, (self.num_effective_pix, 1),
            'Noise_Floors', 'Noise', 'a.u.', None,
            Dimension('arb', '', [1]),
            dtype=np.float32, aux_spec_prefix='Noise_Spec_',
            h5_pos_inds=h5_pos_inds_new, h5_pos_vals=h5_pos_vals_new,
            verbose=self.verbose and self.mpi_rank == 0)
        if self.verbose and self.mpi_rank == 0:
            print('Rank {} - Finished creating the Noise_Floors '
                  'dataset'.format(self.mpi_rank))

    if self.write_filtered:
        # Filtered data is identical to Main_Data in every way - just a
        # duplicate
        self.h5_filtered = create_empty_dataset(
            self.h5_main, self.h5_main.dtype, 'Filtered_Data',
            h5_group=self.h5_results_grp)
        if self.verbose and self.mpi_rank == 0:
            print('Rank {} - Finished creating the Filtered '
                  'dataset'.format(self.mpi_rank))

    self.hot_inds = None

    if self.write_condensed:
        self.hot_inds = np.where(self.composite_filter > 0)[0]
        # Only need to keep half the data
        self.hot_inds = np.uint(
            self.hot_inds[int(0.5 * len(self.hot_inds)):])
        condensed_spec = Dimension('hot_frequencies', '',
                                   int(0.5 * len(self.hot_inds)))
        self.h5_condensed = write_main_dataset(
            self.h5_results_grp,
            (self.num_effective_pix, len(self.hot_inds)),
            'Condensed_Data', 'Complex', 'a. u.', None, condensed_spec,
            h5_pos_inds=h5_pos_inds_new, h5_pos_vals=h5_pos_vals_new,
            # np.complex is deprecated; the builtin complex is equivalent
            # (complex128)
            dtype=complex,
            verbose=self.verbose and self.mpi_rank == 0)
        if self.verbose and self.mpi_rank == 0:
            print('Rank {} - Finished creating the Condensed '
                  'dataset'.format(self.mpi_rank))

    if self.mpi_size > 1:
        self.mpi_comm.Barrier()
    self.h5_main.file.flush()