def create_group_in_hdf5_file(group, hdf5_file):
    """
    Create a group in an HDF5 file.

    Retries (by recursing after a random sleep) when the file is locked by
    another writer, since HDF5 does not allow concurrent write sessions.

    Parameters
    ----------
    group : string
        name of the group that should be created
    hdf5_file : string
        location of the hdf5 file
    """
    try:
        # open in append mode 'a'; write mode 'w' would recreate (truncate) the file
        with h5py.File(hdf5_file, 'a') as hf:
            # create group, for example named after the subject
            hf.create_group(group)
            # BUGFIX: corrected typo 'Succesfully' in the log message
            logging.info('Successfully created group {}'.format(group))
    except IOError:
        # file is most likely open elsewhere; wait a random number of seconds
        sleep = get_random_number_between(1, 5)
        time.sleep(sleep)
        # call function again because hdf5 file does not allow for multiple write sessions
        return create_group_in_hdf5_file(group, hdf5_file)
    except Exception as e:
        logging.warning('Error creating group in HDF5 file: {}'.format(e))
        exit(1)
def delete_dataset_from_group(group_name, dataset, hdf5_file):
    """
    Remove a single dataset from a group inside the HDF5 file.

    Parameters
    ----------
    group_name : string
        the name of the group
    dataset : string
        name of the dataset that needs to be deleted
    hdf5_file : string
        path of the HDF5 file
    """
    # without an existing file there is nothing to delete
    if not os.path.exists(hdf5_file):
        logging.error('HDF5 file does not exist: {}'.format(hdf5_file))
        exit(1)

    try:
        with h5py.File(hdf5_file, 'a') as hf:
            # the group must exist before we can look for the dataset
            if group_name not in list(hf.keys()):
                logging.error('Group {} not present in HDF5 file: {}'.format(
                    group_name, hdf5_file))
                return None
            if dataset not in list(hf[group_name].keys()):
                logging.warning('Dataset {} not present in group {}'.format(
                    dataset, group_name))
                return None
            # remove the dataset from the group
            del hf[group_name][dataset]
    except IOError:
        # file is locked by another writer; back off for a random number of
        # seconds and retry, as HDF5 allows only one write session at a time
        pause = get_random_number_between(1, 3)
        time.sleep(pause)
        return delete_dataset_from_group(group_name, dataset, hdf5_file)
    except Exception as e:
        logging.error('Error deleting group {}: {}'.format(group_name, e))
        exit(1)
def delete_group(group_name, hdf5_file=None):
    """
    Delete an entire group from the HDF5 file.

    Parameters
    ----------
    group_name : string
        the name of the group to be deleted
    hdf5_file : string (optional)
        path of the HDF5 file. If not given, then read from get_hdf5_file function
    """
    # fall back to the default file location when none was supplied
    if hdf5_file is None:
        hdf5_file = get_hdf5_file()

    # without an existing file there is nothing to delete
    if not os.path.exists(hdf5_file):
        logging.error('HDF5 file does not exist: {}'.format(hdf5_file))
        exit(1)

    try:
        with h5py.File(hdf5_file, 'a') as hf:
            if group_name not in hf.keys():
                logging.error('Group {} not present in HDF5 file: {}'.format(
                    group_name, hdf5_file))
                return None
            # delete the whole group, including every dataset it contains
            del hf[group_name]
    except IOError:
        # file locked by another writer: sleep a random amount and retry,
        # since HDF5 does not allow multiple simultaneous write sessions
        pause = get_random_number_between(1, 5)
        time.sleep(pause)
        return delete_group(group_name=group_name, hdf5_file=hdf5_file)
    except Exception as e:
        logging.error('Error deleting group {}: {}'.format(group_name, e))
        exit(1)
def save_meta_data_to_group_dataset(group_name, dataset, meta_data, hdf5_file):
    """
    Save meta data as attributes on a dataset within a group (the key/value
    pairs are attached to the dataset itself, not to the enclosing group).

    Parameters
    ----------
    group_name : string
        HDF5 group name
    dataset : string
        name of the dataset (this is a dataset within a group)
    meta_data : dictionary
        dictionary of meta-data, for instance header or subject information
    hdf5_file : string
        path of the HDF5 file
    """
    try:
        # check if file exists
        if os.path.exists(hdf5_file):
            with h5py.File(hdf5_file, 'a') as hf:
                # the group must exist before the dataset can be looked up
                if group_name in hf.keys():
                    grp = hf[group_name]
                    # the dataset must exist before attributes can be attached
                    if dataset in grp.keys():
                        # meta data must be a dictionary of key/value pairs
                        if isinstance(meta_data, dict):
                            for key, value in meta_data.items():
                                # each pair becomes an attribute of the dataset
                                grp[dataset].attrs[key] = value
                            logging.info(
                                'Meta data saved to group: {} and dataset: {}'.
                                format(group_name, dataset))
                        else:
                            logging.error(
                                'Meta data is not of type dictionary. Received {} instead'
                                .format(type(meta_data)))
                            return None
                    else:
                        logging.error('Dataset {} not part of group {}'.format(
                            dataset, group_name))
                        return None
                else:
                    logging.error(
                        'Group {} not present in HDF5 file: {}'.format(
                            group_name, hdf5_file))
                    return None
        else:
            logging.error('HDF5 file does not exist: {}'.format(hdf5_file))
            return None
    except IOError:
        # HDF5 allows only a single writer; back off randomly and retry
        sleep = get_random_number_between(1, 5)
        time.sleep(sleep)
        return save_meta_data_to_group_dataset(group_name, dataset, meta_data,
                                               hdf5_file)
    except Exception as e:
        logging.error(
            'Error saving meta data to group {} and dataset {}: {}'.format(
                group_name, dataset, e))
        # BUGFIX: exit with a non-zero status so failures are detectable;
        # the previous bare exit() terminated with status 0 (success)
        exit(1)
def read_metadata_from_group_dataset(group_name, dataset, hdf5_file):
    """
    Read the attribute metadata of a dataset inside a group of the HDF5 file.

    Parameters
    ----------
    group_name : string
        HDF5 group name
    dataset : string
        HDF5 dataset name
    hdf5_file : string
        path of the HDF5 file

    Returns
    -------
    metadata : dictionary
        dictionary of meta-data, or None when the group/dataset is missing
    """
    # a missing file is fatal for a read operation
    if not os.path.exists(hdf5_file):
        logging.error('HDF5 file does not exist: {}'.format(hdf5_file))
        exit(1)

    try:
        # read-only access is sufficient here
        with h5py.File(hdf5_file, 'r') as hf:
            if group_name not in hf.keys():
                logging.error('Group {} not present in HDF5 file: {}'.format(
                    group_name, hdf5_file))
                return None
            grp = hf[group_name]
            if dataset not in grp.keys():
                logging.error('Dataset {} not part of group {}'.format(
                    dataset, group_name))
                return None
            # the dataset attributes hold the metadata key/value pairs
            return dict(grp[dataset].attrs)
    except IOError:
        # file locked by another process: sleep briefly and retry, since
        # HDF5 does not allow a read while a write session is open
        pause = get_random_number_between(1, 2)
        time.sleep(pause)
        return read_metadata_from_group_dataset(group_name, dataset, hdf5_file)
def save_multi_data_to_group_hdf5(group, data, data_name, hdf5_file, meta_data=None, overwrite=False, create_group_if_not_exists=True):
    """
    Save a list of datasets into a single group (same functionality as
    save_data_to_group_hdf5 but accepting lists of data).

    Parameters
    ----------
    group : string
        name of the group where the data needs to be stored, e.g. a subject name
    data : list of numpy.array
        data that needs to be stored; data[i] is saved under data_name[i]
    data_name : list of string
        names of the datasets; these are the keys within the group
    hdf5_file : string
        location of the hdf5 file
    meta_data : list of dictionary (optional)
        per-dataset meta data, for instance header or subject information
    overwrite : boolean (optional)
        if True, an existing dataset with the same name is replaced
    create_group_if_not_exists : boolean (optional)
        create the group in the hdf5 file if it does not exist
    """
    # data and data_name are paired element-wise, so they must match in length
    if len(data) != len(data_name):
        logging.error('Size of data and data_name are not the same.')
        exit(1)

    try:
        # append mode 'a'; write mode 'w' would recreate (truncate) the file
        with h5py.File(hdf5_file, 'a') as hf:
            # check if group exists
            group_exists = hf.get(group) is not None

            if create_group_if_not_exists:
                # only create the group when it does not exist yet
                if not group_exists:
                    create_group_in_hdf5_file(group=group, hdf5_file=hdf5_file)
            elif not group_exists:
                # without the group we cannot add data; tell the caller how to fix it
                logging.warning(
                    'Could not add data {} to group because group {} does not exist. Continue with setting the create_group_if_not_exists parameter to True.'
                    .format(data_name, group))
                exit(1)

            # define group as variable
            grp = hf[group]

            # when overwriting, delete the existing datasets first
            if overwrite:
                for i in range(len(data)):
                    if grp.get(data_name[i]) is not None:
                        del grp[data_name[i]]

            for i in range(len(data)):
                if data_name[i] not in grp.keys() or overwrite:
                    # store data in group
                    grp.create_dataset(data_name[i], data=data[i])
                    logging.info('Dataset {} saved in group {}'.format(
                        data_name[i], group))

                    # store per-dataset meta data if present
                    if meta_data is not None:
                        if meta_data[i] is not None and isinstance(
                                meta_data[i], dict):
                            # add each key/value pair as a dataset attribute
                            for key, value in meta_data[i].items():
                                grp[data_name[i]].attrs[key] = value
                            logging.info(
                                'Meta data saved to group: {} with name {}'.
                                format(group, data_name[i]))
                        else:
                            logging.warning(
                                'Meta data is not of type dictionary. Received {} instead. Skiping..'
                                .format(type(meta_data[i])))
                else:
                    logging.warning(
                        'Dataset {} already exists in group {}. Consider setting overwrite = True if you want to overwrite the data.'
                        .format(data_name[i], group))
    except IOError:
        # HDF5 allows only one writer; back off randomly and retry
        sleep = get_random_number_between(1, 2)
        logging.warning(
            'HDF5 file currently open, sleeping {} seconds'.format(sleep))
        time.sleep(sleep)
        # BUGFIX: the retry previously passed the arguments positionally in the
        # wrong order (hdf5_file ended up in the meta_data slot and vice versa);
        # keyword arguments make the retry equivalent to the original call
        return save_multi_data_to_group_hdf5(
            group=group,
            data=data,
            data_name=data_name,
            hdf5_file=hdf5_file,
            meta_data=meta_data,
            overwrite=overwrite,
            create_group_if_not_exists=create_group_if_not_exists)
    except Exception as e:
        logging.error('Error saving datasets to group {}: {}'.format(group, e))
        # BUGFIX: non-zero status on failure (bare exit() reported success)
        exit(1)
def read_dataset_from_group(group_name, dataset, hdf5_file, start_slice=None, end_slice=None, stride=1, verbose=False):
    """
    Read a dataset from a group of the HDF5 file.

    Parameters
    ----------
    group_name : string
        HDF5 group name
    dataset : string
        HDF5 dataset name
    hdf5_file : string
        path of the HDF5 file
    start_slice : int (optional)
        start slice of the data
    end_slice : int (optional)
        end slice of the data
    stride : int (optional)
        stride of the data (i.e. skipping rows)
    verbose : boolean (optional)
        if set to True, log debug information

    Returns
    -------
    data : np.array
        numpy array of the dataset, or None when the group/dataset is missing
    """
    if verbose:
        logging.debug('Reading group: {}, dataset: {}, hdf5_file: {}'.format(
            group_name, dataset, hdf5_file))

    try:
        # check if file exists
        if os.path.exists(hdf5_file):
            with h5py.File(hdf5_file, 'r') as hf:
                # the group must exist before the dataset can be looked up
                if group_name in list(hf.keys()):
                    group = hf[group_name]
                    # get the dataset from the group
                    if dataset in group.keys():
                        # slicing is handled by h5py, so only the requested
                        # part of the dataset is read from disk
                        return group[dataset][start_slice:end_slice:stride]
                    else:
                        logging.error('Dataset {} not part of group {}'.format(
                            dataset, group_name))
                        return None
                else:
                    logging.error(
                        'Group {} not present in HDF5 file: {}'.format(
                            group_name, hdf5_file))
                    return None
        else:
            logging.error('HDF5 file does not exist: {}'.format(hdf5_file))
            exit(1)
    except IOError:
        # file locked by another process: sleep randomly and retry
        sleep = get_random_number_between(1, 5)
        time.sleep(sleep)
        # BUGFIX: the retry previously dropped the verbose argument
        return read_dataset_from_group(group_name, dataset, hdf5_file,
                                       start_slice, end_slice, stride, verbose)
    except Exception as e:
        logging.error('Error reading dataset {} from group {}: {}'.format(
            dataset, group_name, e))
        # BUGFIX: non-zero status on failure (bare exit() reported success)
        exit(1)
def get_all_subjects_hdf5(hdf5_file, filter_on=None):
    """
    Get all the subjects from the hdf5 file where the raw data is stored.
    Subjects are the group keys, since each subject has its own group.

    Parameters
    ----------
    hdf5_file : string
        location of the hdf5 file
    filter_on : string (optional)
        name of a dataset to filter on; only groups that contain this dataset
        are returned. Default is None, so all groups in the HDF5 file are
        returned

    Returns
    -------
    subjects : list
        list of group keys, which are the names of the subjects extracted
        from the gtx3 files; empty list when the HDF5 file does not exist
    """
    try:
        # check if file exists
        if os.path.exists(hdf5_file):
            with h5py.File(hdf5_file, 'r') as hf:
                if filter_on is None:
                    # hf.keys() is a view object tied to the open file, so
                    # convert to a list before the file is closed
                    return list(hf.keys())
                # only keep subjects whose group contains the filter_on dataset
                return [
                    subject for subject in hf.keys()
                    if filter_on in hf[subject].keys()
                ]
        else:
            logging.warning('HDF5 file does not exist: {}'.format(hdf5_file))
            # return empty list
            return []
    except IOError:
        logging.warning(
            'Could not read HDF5 file, possibly already open, retrying')
        # random sleep between seconds
        sleep = get_random_number_between(1, 5)
        time.sleep(sleep)
        # BUGFIX: the retry previously dropped the filter_on argument, which
        # silently disabled filtering after a lock collision
        return get_all_subjects_hdf5(hdf5_file, filter_on=filter_on)
    except Exception as e:
        logging.warning('Error reading subjects from HDF5 file: {}'.format(e))
        exit(1)