import os
import re
import sys
import traceback

import h5py

# NOTE: is_string and printlog are assumed to be helper functions defined
# elsewhere in this package; they are used here as-is.


def get_dataset_names(dbfilepath, dbroot='', dataset_names=None, pathinh5=None):
    """Recursively extracts dataset names from an HDF5 database."""
    if dataset_names is None:
        dataset_names = []
    if is_string(dbfilepath) and os.path.exists(dbfilepath):
        h5file = h5py.File(dbfilepath, 'r')
        item = h5file
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        item = dbfilepath
        isdbfile = 0
    else:
        return dataset_names

    for key, val in item.items():
        try:
            subitem = dict(val)
            # A group is treated as a sample dataset if it holds the expected
            # spectral members or is explicitly flagged via its attributes.
            if ('mz' in subitem) or ('sp' in subitem) or \
                    ('sp_unfiltered_peaks' in subitem) or \
                    (('is_sample_dataset' in val.attrs) and
                     (val.attrs['is_sample_dataset'] == True)):
                success = 1
            else:
                success = 0
        except Exception:
            success = 0

        if success == 1:
            if is_string(pathinh5):
                # Keep only datasets below pathinh5, with that prefix stripped.
                success = 0
                h5str = val.name.split('/')[0:2]
                for i in h5str:
                    if '/' + i == pathinh5:
                        datasetname = re.sub(pathinh5, '', val.name)
                        dataset_names.append(datasetname)
                        success = 1
                        break
            else:
                dataset_names.append(val.name)
        if success == 0 and isinstance(val, h5py.Group):
            dbroot = dbroot + val.name
            dataset_names = get_dataset_names(val, dbroot, dataset_names,
                                              pathinh5=pathinh5)

    if isdbfile == 1:
        h5file.close()
    return sorted(dataset_names)

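# A minimal, self-contained sketch of how get_dataset_names walks a file. The
# file name 'names_demo.h5' and the group layout are hypothetical:
def _demo_dataset_names(dbfilepath='names_demo.h5'):
    import numpy as np
    with h5py.File(dbfilepath, 'w') as h5f:
        # An 'sp' member marks '/raw/sample1' as a sample dataset.
        h5f.create_group('/raw/sample1').create_dataset('sp', data=np.zeros(4))
    # Returns ['/sample1']: the '/raw' prefix is stripped via pathinh5.
    return get_dataset_names(dbfilepath, pathinh5='/raw')
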
def load_dataset(dbfilepath, pathinh5):
    """Loads a dataset from an HDF5 database given its path within the file."""
    pathinh5 = re.sub('//', '/', pathinh5)
    dataset = []
    if is_string(dbfilepath) and os.path.exists(dbfilepath):
        h5file_group = h5py.File(dbfilepath, 'a')
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        h5file_group = dbfilepath
        isdbfile = 0
    else:
        return dataset

    try:
        isdata = pathinh5 in h5file_group
    except Exception as inst:
        printlog(inst)
        traceback.print_exc()
        return dataset

    if isdata:
        # Read the dataset contents into memory.
        dataset = h5file_group[pathinh5][()]
    if isdbfile == 1:
        h5file_group.close()
    return dataset

def save_dataset(dbfilepath, pathinh5, data, chunksize='', compression_opts=''):
    """Saves a dataset into an HDF5 database, re-using storage when possible."""
    pathinh5 = re.sub('//', '/', pathinh5)
    if is_string(dbfilepath) and os.path.exists(dbfilepath):
        h5file_group = h5py.File(dbfilepath, 'a')
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        h5file_group = dbfilepath
        isdbfile = 0
    else:
        return

    try:
        isdata = pathinh5 in h5file_group
    except Exception as inst:
        printlog(inst)
        traceback.print_exc()
        return

    if isdata:
        fdata = h5file_group[pathinh5]
        if (fdata.shape == data.shape) and (fdata.dtype == data.dtype):
            # Overwrite in place when shape and dtype match.
            fdata[...] = data
            if isdbfile == 1:
                h5file_group.close()
            return
        printlog('Deleting original')
        del h5file_group[pathinh5]

    if (not chunksize) and (not compression_opts):
        h5file_group.create_dataset(pathinh5, data=data)
    elif chunksize and compression_opts:
        h5file_group.create_dataset(pathinh5, data=data, chunks=chunksize,
                                    compression='gzip',
                                    compression_opts=compression_opts)
    elif chunksize:
        h5file_group.create_dataset(pathinh5, data=data, chunks=chunksize)
    else:
        h5file_group.create_dataset(pathinh5, data=data, chunks=True,
                                    compression='gzip',
                                    compression_opts=compression_opts)

    if isdbfile == 1:
        h5file_group.close()

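# A minimal round-trip sketch for save_dataset/load_dataset. The file name
# 'demo.h5' and the '/sample1/sp' path are hypothetical, not fixed by this module:
def _demo_save_load(dbfilepath='demo.h5'):
    import numpy as np
    # save_dataset only opens files that already exist, so create one first.
    if not os.path.exists(dbfilepath):
        h5py.File(dbfilepath, 'w').close()
    data = np.arange(12.0).reshape(3, 4)
    save_dataset(dbfilepath, '/sample1/sp', data,
                 chunksize=(3, 4), compression_opts=4)
    restored = load_dataset(dbfilepath, '/sample1/sp')
    assert (restored == data).all()
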
def load_preproc_obj(dbfilepath, procid, pathinh5=''):
    """
    **Loads the pre-processing parameters of a module from the HDF5 database.**

    Args:
        dbfilepath: the name and path of the HDF5 database file
        procid: the module identifier
        pathinh5: the path in the HDF5 file for object storage
    """
    h5objpath = pathinh5 + procid
    h5objpath = re.sub('//', '/', h5objpath)
    ProcObj = {}
    if is_string(dbfilepath) and os.path.exists(dbfilepath):
        h5file_group = h5py.File(dbfilepath, 'a')
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        h5file_group = dbfilepath
        isdbfile = 0
    else:
        return ProcObj

    # Check whether this object is part of the pre-processing workflow.
    try:
        isobj = h5objpath in h5file_group
    except Exception as inst:
        printlog(inst)
        traceback.print_exc()
        return ProcObj

    if not isobj:
        if isdbfile == 1:
            h5file_group.close()
        return ProcObj

    # Re-assemble the object attributes, descending one level into subgroups.
    h5obj = h5file_group[h5objpath]
    for i_name in h5obj.keys():
        if isinstance(h5obj[i_name], h5py.Group):
            h5subobj = h5obj[i_name]
            subProcObj = {}
            for j_name in h5subobj.keys():
                subProcObj[j_name] = load_dataset(h5subobj, j_name)
            ProcObj[i_name] = subProcObj
        else:
            ProcObj[i_name] = load_dataset(h5obj, i_name)

    if isdbfile == 1:
        h5file_group.close()
    return ProcObj

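# load_preproc_obj returns a plain dict mirroring the stored object: one key per
# saved attribute, with one nested dict level per subgroup, e.g. (hypothetical
# values; see save_preproc_obj below for the writer):
#     {'description': b'demo_smoothing', 'window': 5, 'params': {'order': 3}}
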
def get_traindata_names(dbfilepath, dbroot='', dataset_names=None, istrain=1):
    """Recursively extracts the names of datasets whose 'istrain' flag matches
    istrain from an HDF5 database."""
    if dataset_names is None:
        dataset_names = []
    if is_string(dbfilepath) and os.path.exists(dbfilepath):
        h5file = h5py.File(dbfilepath, 'r')
        item = h5file
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        item = dbfilepath
        isdbfile = 0
    else:
        return dataset_names

    for key, val in item.items():
        try:
            subitem = dict(val)
            # A dataset qualifies if it holds spectra and its 'istrain' flag
            # matches the requested value.
            if ('istrain' in subitem) and ('Sp' in subitem):
                if load_dataset(item, val.name + '/istrain') == istrain:
                    success = 1
                else:
                    success = 0
            else:
                success = 0
        except Exception as inst:
            printlog(inst)
            traceback.print_exc()
            success = 0

        if success == 1:
            dataset_names.append(val.name)
        elif isinstance(val, h5py.Group):
            dbroot = dbroot + val.name
            dataset_names = get_traindata_names(val, dbroot, dataset_names,
                                                istrain)

    if isdbfile == 1:
        h5file.close()
    return dataset_names

def h5pathfinder(datapath):
    """Finds a suitable path in the database file for storage of workflow
    metadata."""
    h5inpath = ''
    if not is_string(datapath):
        return h5inpath
    splitpath = datapath.split('/')
    nsplits = len(splitpath)
    if nsplits == 2:
        if splitpath[0] != '':
            h5inpath = splitpath[0] + '/'
    elif nsplits > 2:
        # Keep everything up to, but not including, the final path component.
        for i in range(nsplits - 1):
            h5inpath = h5inpath + splitpath[i] + '/'
    return h5inpath

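# For example, h5pathfinder('/group/sub/dataset') returns '/group/sub/' and
# h5pathfinder('sample1/sp') returns 'sample1/', so metadata lands next to the
# data it describes; a bare root-level path such as '/sp' yields ''.
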
def print_structure_h5db(dbfilepath, dbroot='', offset='    '):
    """Prints the HDF5 database structure."""
    if is_string(dbfilepath) and os.path.exists(dbfilepath):
        h5file = h5py.File(dbfilepath, 'r')
        item = h5file
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group, h5py.Dataset)):
        # Datasets are accepted as well so that recursion can print them.
        item = dbfilepath
        isdbfile = 0
    else:
        return

    if isinstance(item, h5py.File):
        printlog(item.file, '(File)', item.name)
    elif isinstance(item, h5py.Dataset):
        printlog('(Dataset)', item.name, ' len =', item.shape)
    elif isinstance(item, h5py.Group):
        printlog('(Group)', item.name)
    else:
        printlog('Warning: The item type is unknown', item.name)
        sys.exit('execution is terminated')

    if isinstance(item, (h5py.File, h5py.Group)):
        for key, val in dict(item).items():
            printlog(offset, key)
            dbroot = dbroot + 'i'
            print_structure_h5db(val, dbroot=dbroot, offset=offset + '    ')

    if isdbfile == 1:
        h5file.close()

def save_preproc_obj(dbfilepath, ProcObj, pathinh5=''):
    """
    **Saves the pre-processing parameters of a module into the HDF5 database.**

    Args:
        dbfilepath: the name and path of the HDF5 database file
        ProcObj: the pre-processing workflow object
        pathinh5: the path in the HDF5 file for object storage
    """
    h5objpath = pathinh5 + ProcObj.description
    h5objpath = re.sub('//', '/', h5objpath)

    if is_string(dbfilepath) and os.path.exists(dbfilepath):
        h5file_group = h5py.File(dbfilepath, 'a')
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        h5file_group = dbfilepath
        isdbfile = 0
    else:
        return

    try:
        objvars = vars(ProcObj)
    except Exception as inst:
        printlog(inst)
        traceback.print_exc()
        return

    try:
        isgroup = h5objpath in h5file_group
    except Exception as inst:
        printlog(inst)
        traceback.print_exc()
        return

    if isgroup:
        printlog('%s object has already been saved into the database file'
                 % h5objpath)
        if isdbfile == 1:
            h5file_group.close()
        return
    h5file_group.create_group(h5objpath)

    # Store each attribute as a dataset; dict attributes become subgroups.
    h5obj = h5file_group[h5objpath]
    for i_name in objvars.keys():
        subobj = objvars[i_name]
        if isinstance(subobj, dict):
            h5obj.create_group(i_name)
            h5subobj = h5obj[i_name]
            for j_name in subobj.keys():
                save_dataset(h5subobj, j_name, subobj[j_name])
        else:
            save_dataset(h5obj, i_name, subobj)

    printlog('\n%s from pre-processing workflow have been saved to --> %s'
             % (h5objpath, str(dbfilepath)))
    if isdbfile == 1:
        h5file_group.close()

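# A minimal sketch of the save/load round trip for pre-processing objects. The
# _DemoProc class, 'demo.h5' file and '/proc/' path are hypothetical; any object
# whose attributes are arrays, scalars or flat dicts fits the scheme above:
def _demo_preproc_roundtrip(dbfilepath='demo.h5'):
    import numpy as np

    class _DemoProc(object):
        def __init__(self):
            self.description = 'demo_smoothing'
            self.window = np.array([5])
            self.params = {'order': np.array([3])}

    if not os.path.exists(dbfilepath):
        h5py.File(dbfilepath, 'w').close()
    save_preproc_obj(dbfilepath, _DemoProc(), pathinh5='/proc/')
    return load_preproc_obj(dbfilepath, 'demo_smoothing', pathinh5='/proc/')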