def make_vds(self, f):
    """Create the virtual dataset ``'virtual'`` inside the open file ``f``.

    The layout has shape ``(2, 10)`` and dtype ``'f4'``.  Row 0 is mapped to
    the ``'data'`` dataset of ``self.f1`` and row 1 to the ``'data'`` dataset
    of ``self.f2``; both sources are declared with shape ``(10,)``.
    """
    vds_layout = h5.VirtualLayout((2, 10), 'f4')
    for row, source_file in enumerate((self.f1, self.f2)):
        vds_layout[row] = h5.VirtualSource(source_file, 'data', shape=(10, ))
    f.create_virtual_dataset('virtual', vds_layout)
def combine_h5(h5dir, out_h5file):
    """Combine every ``*.h5`` file of a folder into one virtual-dataset file.

    Each input file must contain the datasets ``utt_ids``, ``spk_ids``,
    ``positions`` and ``mfcc``.  The output file gets four virtual datasets
    of the same names that concatenate the inputs along the first axis.

    Args:
        h5dir: Folder that is searched (non-recursively) for ``*.h5`` files.
        out_h5file: Path of the HDF5 file to create (opened with mode ``"w"``).

    Raises:
        ValueError: If ``h5dir`` contains no ``*.h5`` file.
    """
    # Sort so that the concatenation order is reproducible; the order in
    # which glob() yields entries depends on the filesystem.
    filelist = sorted(pathlib.Path(h5dir).glob('*.h5'))
    if not filelist:
        raise ValueError(f"No .h5 files found in {h5dir}")
    str_dtype = h5py.special_dtype(vlen=str)

    # Get no. of utts (spks) and no. of frames of each .h5 file in the folder
    n_utts = []
    n_frames = []
    for path in filelist:
        with h5py.File(path, 'r') as f:
            n_utts.append(len(f['utt_ids']))
            n_frames.append(f['mfcc'].shape[0])
            # The feature dimension of the last file read is used for the layout.
            mfcc_dim = f['mfcc'].shape[1]
    tot_n_utts = sum(n_utts)
    tot_n_frames = sum(n_frames)
    print(f"Total no. of utts = {tot_n_utts}")
    print(f"Total no. of frames = {tot_n_frames}")
    print(f"MFCC dim = {mfcc_dim}")

    # Assemble virtual dataset
    utt_layout = h5py.VirtualLayout(shape=(tot_n_utts, ), dtype=str_dtype)
    spk_layout = h5py.VirtualLayout(shape=(tot_n_utts, ), dtype=str_dtype)
    pos_layout = h5py.VirtualLayout(shape=(tot_n_utts, 2), dtype="int64")
    mfc_layout = h5py.VirtualLayout(shape=(tot_n_frames, mfcc_dim), dtype="float32")
    k1 = 0  # first utterance row of the current file in the combined layouts
    k2 = 0  # first frame row of the current file in the combined layout
    for path, n_utt, n_frame in zip(filelist, n_utts, n_frames):
        print(f"Reading {path}")
        k1_end = k1 + n_utt
        k2_end = k2 + n_frame
        print(f"spk_ids: {range(k1, k1_end)}")
        print(f"mfcc: {range(k2, k2_end)}")
        # Contiguous blocks are selected with plain slices.
        utt_layout[k1:k1_end] = h5py.VirtualSource(path, "utt_ids", shape=(n_utt, ))
        spk_layout[k1:k1_end] = h5py.VirtualSource(path, "spk_ids", shape=(n_utt, ))
        pos_layout[k1:k1_end] = h5py.VirtualSource(path, "positions", shape=(n_utt, 2))
        mfc_layout[k2:k2_end] = h5py.VirtualSource(path, "mfcc",
                                                   shape=(n_frame, mfcc_dim))
        k1 = k1_end
        k2 = k2_end

    # Add virtual dataset to output file
    with h5py.File(out_h5file, "w", libver="latest") as f:
        print(f"Writing combined file {out_h5file}")
        f.create_virtual_dataset("utt_ids", utt_layout, fillvalue=None)
        f.create_virtual_dataset("spk_ids", spk_layout, fillvalue=None)
        f.create_virtual_dataset("positions", pos_layout, fillvalue=None)
        f.create_virtual_dataset("mfcc", mfc_layout, fillvalue=None)
def createResource(cls, directory):
    """Create the HDF5 fixture files inside ``directory``.

    Three files are written:

    * ``base__external.h5`` - targets for external links and the two source
      datasets (``/ext/vds0``, ``/ext/vds1``) of the virtual dataset.
    * ``base__external.dat`` - raw numbers backing the external dataset.
    * ``base.h5`` - the main file with soft links, external links, broken
      links, a virtual dataset (``/ext/virtual``) and a dataset stored in
      the external raw file (``/ext/raw``).

    :param str directory: Existing directory in which the files are created.
    :returns: Path of ``base.h5``.
    :rtype: str
    """
    filename = os.path.join(directory, "base.h5")
    extH5FileName = os.path.join(directory, "base__external.h5")
    extDatFileName = os.path.join(directory, "base__external.dat")

    # Context managers make sure the files are closed even if a statement fails.
    with h5py.File(extH5FileName, mode="w") as externalh5:
        externalh5["target/dataset"] = 50
        externalh5["target/link"] = h5py.SoftLink("/target/dataset")
        externalh5["/ext/vds0"] = [0, 1]
        externalh5["/ext/vds1"] = [2, 3]

    numpy.array([0, 1, 10, 10, 2, 3]).tofile(extDatFileName)

    with h5py.File(filename, mode="w") as h5:
        h5["group/dataset"] = 50
        h5["link/soft_link"] = h5py.SoftLink("/group/dataset")
        h5["link/soft_link_to_group"] = h5py.SoftLink("/group")
        h5["link/soft_link_to_link"] = h5py.SoftLink("/link/soft_link")
        h5["link/soft_link_to_file"] = h5py.SoftLink("/")
        h5["group/soft_link_relative"] = h5py.SoftLink("dataset")
        h5["link/external_link"] = h5py.ExternalLink(extH5FileName,
                                                     "/target/dataset")
        h5["link/external_link_to_link"] = h5py.ExternalLink(
            extH5FileName, "/target/link")
        h5["broken_link/external_broken_file"] = h5py.ExternalLink(
            extH5FileName + "_not_exists", "/target/link")
        h5["broken_link/external_broken_link"] = h5py.ExternalLink(
            extH5FileName, "/target/not_exists")
        h5["broken_link/soft_broken_link"] = h5py.SoftLink("/group/not_exists")
        # NOTE(review): same target as soft_broken_link, not a link to that
        # link - confirm this is intended before changing it.
        h5["broken_link/soft_link_to_broken_link"] = h5py.SoftLink(
            "/group/not_exists")

        # The sources are referenced by a path relative to base.h5.
        layout = h5py.VirtualLayout((2, 2), dtype=int)
        layout[0] = h5py.VirtualSource("base__external.h5",
                                       name="/ext/vds0",
                                       shape=(2, ),
                                       dtype=int)
        layout[1] = h5py.VirtualSource("base__external.h5",
                                       name="/ext/vds1",
                                       shape=(2, ),
                                       dtype=int)
        h5.create_group("/ext")
        h5["/ext"].create_virtual_dataset("virtual", layout)

        # (file name, byte offset, byte size) triples; the offsets assume
        # 8-byte items in the .dat file - TODO confirm on platforms where
        # the default numpy integer is 4 bytes.
        external = [("base__external.dat", 0, 2 * 8),
                    ("base__external.dat", 4 * 8, 2 * 8)]
        h5["/ext"].create_dataset("raw",
                                  shape=(2, 2),
                                  dtype=int,
                                  external=external)

    return filename
def test_extra_args(self):
    """A VirtualSource built from a Dataset rejects extra describing arguments.

    Passing a name, ``shape``, ``maxshape`` or ``dtype`` next to a dataset
    object is expected to raise ``TypeError``.
    """
    rejected_calls = (
        (('b', ), {}),
        ((), {'shape': (1, )}),
        ((), {'maxshape': (None, )}),
        ((), {'dtype': int}),
    )
    with h5.File(name='f1', driver='core', backing_store=False) as ftest:
        ftest['a'] = [1, 2, 3]
        dset = ftest['a']
        for extra_args, extra_kwargs in rejected_calls:
            with self.assertRaises(TypeError):
                h5.VirtualSource(dset, *extra_args, **extra_kwargs)
def virtual_sources(files: List[Path], meta_file: h5py.File) -> VirtualSourceInfo:
    """
    Create HDF5 virtual sources and collate ancillary information from raw data files.

    Args:
        files:      Lexicographically sorted list of raw file paths.
        meta_file:  Tristan detector metadata file object.

    Returns:
        - Dictionary of event data set names and iterators of corresponding HDF5
          virtual sources.  The iterator of sources for each data set is based on
          itertools.cycle and so repeats indefinitely in the order in which
          successive event slices should be selected to build the virtual data set.
        - Dictionary of cue data set names and lists of corresponding HDF5 virtual
          sources.  The lists of sources have length and order as per the list of
          input files.
        - List of the number of cues in each data file after zero-padding has been
          stripped.  Length and order as per the list of input files.
        - Dictionary of data set names and corresponding data types.
    """
    event_sources = {key: [] for key in event_keys}
    cue_sources = {key: [] for key in cue_keys}
    num_cues_per_file = []

    with ExitStack() as stack:
        # The raw files are only read, so open them read-only explicitly rather
        # than relying on the version-dependent default mode.
        raw_files = [stack.enter_context(h5py.File(path, "r")) for path in files]

        dtypes = {key: raw_files[0][key].dtype for key in event_keys + cue_keys}

        for raw_file in raw_files:
            # The cues are padded with zeroes.  Find the first so we can slice
            # them off.
            is_padding = raw_file["cue_id"][()] == 0
            if is_padding.any():
                num_cues = np.argmax(is_padding)
            else:
                # No padding at all: argmax of an all-False array would be 0,
                # which would wrongly report an empty file.
                num_cues = is_padding.size
            num_cues_per_file.append(num_cues)
            for key in event_keys:
                event_sources[key].append(h5py.VirtualSource(raw_file[key]))
            for key in cue_keys:
                cue_sources[key].append(h5py.VirtualSource(raw_file[key]))

    # Make a list of slices with which to divide the lexicographically sorted list
    # of file paths into sub-lists, each slice corresponding to a different
    # detector module.  Ordered by module number.  Length is equal to the number
    # of modules in the detector.
    file_slices = np.pad(np.cumsum(meta_file["fp_per_module"]), (1, 0))
    file_slices = list(map(slice, file_slices[:-1], file_slices[1:]))

    # Construct a carousel to select time slices in the order in which they should
    # appear in the virtual layout.
    for key, sources in event_sources.items():
        carousel = zip(*(cycle(sources[file_slice]) for file_slice in file_slices))
        event_sources[key] = chain.from_iterable(carousel)

    return event_sources, cue_sources, num_cues_per_file, dtypes
def __init__(self, filenames, default_streams=None):
    """Open one HDF5 file, or several files joined through virtual datasets.

    When ``filenames`` is a list, every file is loaded with
    ``load_single_h5`` and, for each stream of the first file, a virtual
    dataset concatenating all files along the first axis is written to a
    named temporary file, which then becomes ``self.h5_file``.  All files
    are assumed to provide the streams, trailing shapes and dtypes of the
    first file.  Otherwise ``filenames`` is loaded directly.

    ``default_streams``, if given, must only name available streams.
    """
    super(H5DatasetLoader, self).__init__()
    self.filenames = filenames
    if not isinstance(self.filenames, list):
        self.h5_file = H5DatasetLoader.load_single_h5(self.filenames)[0]
    else:
        self._h5_tempfile = tempfile.NamedTemporaryFile()
        loaded = [H5DatasetLoader.load_single_h5(path) for path in self.filenames]
        self._allfiles, _allstreams, _lengths = zip(*loaded)
        total_len = sum(_lengths)
        # offsets[i]:offsets[i + 1] is the row range of file i in the joined data
        offsets = np.cumsum((0,) + _lengths)
        first_file = self._allfiles[0]
        for stream in _allstreams[0]:
            joined_shape = (total_len, ) + first_file[stream].shape[1:]
            layout = h5py.VirtualLayout(shape=joined_shape,
                                        dtype=first_file[stream].dtype)
            for idx, source_file in enumerate(self._allfiles):
                layout[offsets[idx]:offsets[idx + 1]] = h5py.VirtualSource(
                    source_file[stream])
            with h5py.File(self._h5_tempfile.name, 'a', libver='latest') as vds_file:
                vds_file.create_virtual_dataset(stream, layout,)
        self._h5_tempfile.flush()
        self.h5_file = H5DatasetLoader.load_single_h5(self._h5_tempfile.name)[0]

    self.streams_available = list(self.h5_file.keys())
    self.default_streams = default_streams
    if default_streams is not None:
        for s in default_streams:
            assert s in self.streams_available, f"{s} not found in available streams"
def make_linked_stack(self, fullname):
    """
    Actually makes the stacked dataset.

    This is a separate method since h5py's visit items does not follow
    external links.

    The dataset ``fullname`` of every file in ``self.file_numbers`` becomes
    one slice along a new first axis of the virtual dataset; the attributes
    of the first source dataset are copied onto the result.

    fullname
        string key to the dataset to be converted into a stacked VDS
    """
    first_path = self.source_path_pattern % (self.file_numbers[0])
    # Read shape and attributes in one go and close the file again instead of
    # leaving two open handles behind.
    with h5.File(first_path, 'r') as first_file:
        first_dset = first_file[fullname]
        datashape = first_dset.shape
        source_attrs = list(first_dset.attrs.items())

    outshape = (len(self.file_numbers), ) + datashape
    TGT = h5.VirtualTarget(self.target_path, fullname, shape=outshape)
    VMlist = []
    for k, fnum in enumerate(self.file_numbers):
        print(fnum)
        source_path = self.source_path_pattern % (fnum)
        VSRC = h5.VirtualSource(source_path, fullname, shape=datashape)
        # builtin float is what the removed alias np.float referred to
        VMlist.append(h5.VirtualMap(VSRC, TGT[k:(k + 1):1], dtype=float))
    self.outfile.create_virtual_dataset(VMlist=VMlist, fillvalue=0)

    for key, val in source_attrs:
        self.outfile[fullname].attrs[key] = val
def writesino(h5name, omegas, dtys, filenames):
    """Write a sinogram HDF5 file whose frames live in external binary files.

    For every scan ``i`` a group ``scans/scan%04d`` is created holding

    * ``data``  - a dataset stored externally, one binary file per frame,
    * ``omega`` - ``omegas[i]``,
    * ``dty``   - ``dtys[i]``.

    The group ``sinogram`` then holds the concatenated ``omega`` and ``dty``
    values and a virtual dataset ``data`` stacking the frames of all scans.

    Parameters
    ----------
    h5name : path of the HDF5 file to create (mode ``"w"``)
    omegas : sequence with one array of omega values per scan
    dtys : sequence with one array of dty values per scan
    filenames : sequence with one list of frame file names per scan; the
        layout of the first file (as reported by ``binary_info``) is applied
        to all of them
    """
    offset, size, shape, dtype = binary_info(filenames[0][0])
    print(offset, size, shape, dtype)
    # Count the frames that are actually mapped below, so scans of different
    # lengths still fit the layout exactly.
    nframes = sum(len(scan) for scan in filenames)
    print(nframes, len(omegas), sum(len(o) for o in omegas))
    # Now create a hdf5 file:
    with h5py.File(h5name, "w", libver='latest') as h:
        # now create a VDS linking within the same file
        layout = h5py.VirtualLayout(shape=(nframes, shape[0], shape[1]),
                                    dtype=dtype)
        j = 0  # first frame of the current scan inside the layout
        graw = h.require_group('scans')
        for i, scan in enumerate(filenames):
            scan_shape = (len(scan), shape[0], shape[1])
            g = graw.require_group('scan%04d' % (i))
            g.create_dataset("data",
                             shape=scan_shape,
                             dtype=dtype,
                             external=[(fname, offset, size) for fname in scan])
            g.create_dataset("omega", data=omegas[i])
            g.create_dataset("dty", data=dtys[i])
            vsource = h5py.VirtualSource(h.filename,  # ok - circular?
                                         'scans/scan%04d/data' % (i),
                                         shape=scan_shape)
            layout[j:j + len(scan), :, :] = vsource
            j += len(scan)
        g = h.require_group('sinogram')
        g.create_dataset('omega', data=np.concatenate(omegas))
        g.create_dataset('dty', data=np.concatenate(dtys))
        g.create_virtual_dataset('data', layout)
def _assemble_data(self, source, key): """Assemble chunks of data into a virtual layout""" # First, get a list of all non-empty data chunks chunks = [ c for c in self.data._find_data_chunks(source, key) if (c.counts > 0).any() ] chunks.sort(key=lambda c: c.train_ids[0]) if not chunks: return None, None # Create the layout, which will describe what data is where n_total = np.sum([c.counts.sum() for c in chunks]) ds0 = chunks[0].dataset layout = h5py.VirtualLayout(shape=(n_total, ) + ds0.shape[1:], dtype=ds0.dtype) # Map each chunk into the relevant part of the layout output_cursor = np.uint64(0) for chunk in chunks: n = chunk.counts.sum() src = h5py.VirtualSource(chunk.dataset) src = src[chunk.slice] layout[output_cursor:output_cursor + n] = src output_cursor += n assert output_cursor == n_total # Make an array of which train ID each data entry is for: train_ids = np.concatenate( [np.repeat(c.train_ids, c.counts.astype(np.intp)) for c in chunks]) return layout, train_ids
def test_mismatched_selections(self):
    """Assigning a 50-element source selection to 49 layout elements fails.

    The assignment is expected to raise ``ValueError`` with a message
    matching ``'different number'``.
    """
    source_path = osp.join(self.tmpdir, "1.h5")
    source = h5.VirtualSource(source_path, 'data', shape=(100, ))
    target = h5.VirtualLayout((4, 100), 'i4', maxshape=(4, None))
    with self.assertRaisesRegex(ValueError, r'different number'):
        target[0, :49] = source[0:100:2]
def test_eiger_high_level(self):
    """Stack the source files along the first axis into one virtual dataset.

    Each file in ``self.fname`` contributes its ``'data'`` dataset as a
    consecutive block of frames; the checks then probe one pixel per block.
    """
    self.outfile = self.working_dir + 'eiger.h5'
    TGT = h5.VirtualTarget(self.outfile, 'data', shape=(78, 200, 200))
    VMlist = []
    M_minus_1 = 0  # first target frame of the current source file
    # Create the virtual dataset file
    with h5.File(self.outfile, 'w', libver='latest') as f:
        for foo in self.fname:
            # Only the shape is needed; close the source again right away.
            with h5.File(foo, 'r') as source_file:
                src_shape = source_file['data'].shape
            M = M_minus_1 + src_shape[0]
            VSRC = h5.VirtualSource(foo, 'data', shape=src_shape)
            # builtin float is what the removed alias np.float referred to
            VM = h5.VirtualMap(VSRC, TGT[M_minus_1:M, :, :], dtype=float)
            VMlist.append(VM)
            M_minus_1 = M
        f.create_virtual_dataset(VMlist=VMlist, fillvalue=45)

    # The file is closed even when one of the assertions fails.
    with h5.File(self.outfile, 'r') as f:
        data = f['data']
        self.assertEqual(data[10, 100, 10], 0.0)
        self.assertEqual(data[30, 100, 100], 1.0)
        self.assertEqual(data[50, 100, 100], 2.0)
        self.assertEqual(data[70, 100, 100], 3.0)
def test_percival_high_level(self):
    """Interleave the source files frame by frame into one virtual dataset.

    Source ``k`` is mapped onto target frames ``k, k + 4, k + 8, ...``, so
    the first eight frames are expected to read ``0, 1, 2, 3, 0, 1, 2, 3``
    at the probed pixel.
    """
    self.outfile = self.working_dir + 'percival.h5'
    VM = []
    # Create the virtual dataset file
    with h5.File(self.outfile, 'w', libver='latest') as f:
        # Virtual target is a representation of the output dataset
        TGT = h5.VirtualTarget(self.outfile,
                               'data',
                               shape=(79, 200, 200),
                               maxshape=(None, 200, 200))
        for k, foo in enumerate(self.fname):
            VSRC = h5.VirtualSource(foo,
                                    'data',
                                    shape=(20, 200, 200),
                                    maxshape=(None, 200, 200))
            # builtin float is what the removed alias np.float referred to
            VM.append(h5.VirtualMap(VSRC, TGT[k:79:4, :, :], dtype=float))
        # pass the fill value and list of maps
        f.create_virtual_dataset(VMlist=VM, fillvalue=-5)

    with h5.File(self.outfile, 'r') as f:
        data = f['data']
        sh = data.shape
        line = data[:8, 100, 100]
    # range() must be turned into a list before it can be repeated on Python 3
    foo = np.array(2 * list(range(4)))
    self.assertEqual(
        sh,
        (79, 200, 200),
    )
    np.testing.assert_array_equal(line, foo)
def test_percival_high_level(self):
    """Interleave the source files frame by frame through a VirtualLayout.

    Source ``k`` is mapped onto layout frames ``k, k + 4, k + 8, ...``; the
    fourth source is declared with 19 frames, the others with 20.  The first
    eight frames are expected to read ``0, 1, 2, 3, 0, 1, 2, 3`` at the
    probed pixel.
    """
    outfile = osp.join(self.working_dir, 'percival.h5')

    # Virtual layout is a representation of the output dataset.
    # builtin float is what the removed alias np.float referred to.
    layout = h5.VirtualLayout(shape=(79, 200, 200), dtype=float)
    for k, filename in enumerate(self.fname):
        dim1 = 19 if k == 3 else 20
        vsource = h5.VirtualSource(filename, 'data', shape=(dim1, 200, 200))
        layout[k:79:4, :, :] = vsource[:, :, :]

    # Create the virtual dataset file
    with h5.File(outfile, 'w', libver='latest') as f:
        f.create_virtual_dataset('data', layout, fillvalue=-5)

    foo = np.array(2 * list(range(4)))
    with h5.File(outfile, 'r') as f:
        ds = f['data']
        line = ds[:8, 100, 100]
        self.assertEqual(
            ds.shape,
            (79, 200, 200),
        )
        assert_array_equal(line, foo)
def create_virtual_data(file_pattern, x, entry_key, save_to):
    """Stack one dataset of several HDF5 files into a virtual dataset.

    Args:
        file_pattern: ``%``-style pattern that yields a file path when
            formatted with an element of ``x``.
        x: Elements to substitute into ``file_pattern``; paths that do not
            exist are skipped.
        entry_key: Name of the dataset inside every source file; also the
            name of the virtual dataset that is created.  Shape and dtype are
            taken from the first existing file.
        save_to: Path of the HDF5 file to create (opened with mode ``'w'``).

    Returns:
        ``True`` when the virtual dataset was written, ``False`` when none of
        the candidate files exists.
    """
    candidates = (file_pattern % el for el in x)
    files = [path for path in candidates if os.path.exists(path)]
    if not files:
        print('No valid files found')
        return False

    # get file shape
    with h5py.File(files[0], 'r') as sample_f:
        sh = sample_f[entry_key].shape
        dtype = sample_f[entry_key].dtype

    layout = h5py.VirtualLayout(shape=(len(files), ) + sh, dtype=dtype)
    for i, filename in enumerate(files):
        print(filename)
        # Indexing only the first axis works for sources of any rank.
        layout[i] = h5py.VirtualSource(filename, entry_key, shape=sh)

    with h5py.File(save_to, 'w', libver='latest') as f:
        f.create_virtual_dataset(entry_key, layout, fillvalue=np.nan)
    return True
def _map_layouts(self, layouts): """ Map virtual sources into virtual layouts. Parameters ---------- layouts: dict A dictionary of unmapped virtual layouts. Returns ------- layouts: dict A dictionary of virtual layouts mapped to the virtual sources. """ for name, layout in layouts.items(): key = '{}.{}'.format(self.group_label, name) have_data = np.zeros((self.nframes, self.nmodules), dtype=bool) for source, modno in self.detdata.source_to_modno.items(): print(f" ### Source: {source}, ModNo: {modno}, Key: {key}") module_ix = self._get_module_index(modno) for chunk in self.data._find_data_chunks(source, key): vsrc = h5py.VirtualSource(chunk.dataset) self._map_chunk(chunk, vsrc, layout, module_ix, have_data) filled_pct = 100 * have_data.sum() / have_data.size if hasattr(layout, 'sources'): n_mappings = len(layout.sources) # h5py < 3.3 else: n_mappings = layout.dcpl.get_virtual_count() # h5py >= 3.3 log.info(f"Assembled {n_mappings:d} chunks for {key:s}, " f"filling {filled_pct:.2f}% of the hyperslab") return layouts
def merge(output, h5s):
    """Merge several HDF5 files into one file of virtual datasets.

    For every group of the first input file and every dataset in it, a
    virtual dataset of the same name is created in ``output`` that
    concatenates the corresponding datasets of all inputs along the first
    axis.  The merged shape is derived from the ``images`` dataset of the
    first group and is applied to every dataset.

    Args:
        output: Path of the HDF5 file to create (opened with mode ``"w"``).
        h5s: Paths of the input files.

    Raises:
        AssertionError: If the image shape of an input differs from that of
            the first input.
    """
    from contextlib import ExitStack

    # ExitStack closes every file that was opened, even when opening a later
    # one fails part-way through the list.
    with ExitStack() as stack:
        dfs = [stack.enter_context(h5py.File(h5, "r")) for h5 in h5s]

        im_key = list(dfs[0].keys())[0] + "/images"
        im_shape = dfs[0][im_key].shape[1:]
        n_images = 0
        for df in dfs:
            assert df[im_key].shape[
                1:] == im_shape, "Image shape in %s (%s) does not equal %s" % (
                    df.filename, str(df[im_key].shape[1:]), str(im_shape))
            n_images += df[im_key].shape[0]
        merged_shape = (n_images, ) + tuple(im_shape)

        with h5py.File(output, "w") as merged_df:
            for changrp in dfs[0].keys():
                mergedgrp = merged_df.create_group(changrp)
                for key in dfs[0][changrp].keys():
                    layout = h5py.VirtualLayout(
                        shape=merged_shape, dtype=dfs[0][changrp][key].dtype)
                    i = 0  # first row of the current input inside the layout
                    for df in dfs:
                        vsource = h5py.VirtualSource(df[changrp + "/" + key])
                        n_rows = vsource.shape[0]
                        layout[i:i + n_rows] = vsource
                        i += n_rows
                    mergedgrp.create_virtual_dataset(key, layout)
def joinVDS(infilenames, outfilename):
    """
    Create a new HDF5 file of virtual datasets that concatenate, along the
    first axis, every dataset of ``infilenames[0]`` with ndim > 1 over all
    files in ``infilenames``.
    """
    layouts = {}

    def _register_layout(name, obj):
        # Skip groups and datasets with fewer than two dimensions
        if not isinstance(obj, h5py.Dataset) or len(obj.shape) <= 1:
            return
        trailing = obj.shape[1:]
        layouts[name] = h5py.VirtualLayout(shape=(0, *trailing),
                                           maxshape=(None, *trailing),
                                           dtype=obj.dtype)

    # visititems walks into subgroups, so no manual recursion is needed
    with h5py.File(infilenames[0], "r") as firstfile:
        firstfile.visititems(_register_layout)

    for filename in infilenames:
        with h5py.File(filename, "r") as currentfile:
            for key, layout in layouts.items():
                vsource = h5py.VirtualSource(currentfile[key])
                n_rows = vsource.shape[0]
                # Grow the layout, then map the source onto the new rows
                layout.shape = (layout.shape[0] + n_rows, *layout.shape[1:])
                layout[-n_rows:, ...] = vsource[:]

    with h5py.File(outfilename, "w", libver="latest") as outfile:
        for key, layout in layouts.items():
            outfile.create_virtual_dataset(key, layout, fillvalue=None)
def save_epix(out_file, descriptor, trains, shape, epix_id):
    """
    Save EPIX data to a VDS HDF5 file

    out_file - HDF5 file (or group) in which the virtual dataset is created
    descriptor - list of data files to save
    trains - train IDs to save
    shape - EPIX data shape
    epix_id - EPIX detector number
    """
    vds_layout = h5py.VirtualLayout(shape=(trains.size, ) + shape,
                                    dtype=np.uint16)
    n_mapped = 0  # number of layout rows mapped so far
    for file_name in descriptor:
        base_name = os.path.basename(file_name)
        print('Opening file: {}'.format(base_name))
        with h5py.File(file_name, 'r') as data_file:
            file_trains = data_file[config.EPIX_TRAIN_KEY][:]
            file_data = data_file[config.EPIX_KEY.format(epix_id)]
            # Indices inside this file of every requested train ID
            file_idxs = np.concatenate(
                [np.where(train_id == file_trains)[0] for train_id in trains])
            chunk_size = file_data.chunks[0]
            num_chunks = int(np.ceil(file_idxs.size / chunk_size))
            # Map the selected frames one chunk-sized batch at a time
            for chunk in range(num_chunks):
                start = chunk * chunk_size
                end = min(file_data.shape[0], start + chunk_size)
                selected = file_idxs[start:end]
                vds_layout[n_mapped:n_mapped + selected.size] = \
                    h5py.VirtualSource(file_data)[selected, :, :]
                n_mapped += selected.size
        print('File {0} saved, data size: {1:d}\n'.format(base_name, n_mapped))
    out_file.create_virtual_dataset(config.EPIX_DATA_KEY.format(epix_id),
                                    vds_layout)
def h5_virtual_file(filenames, name="data"):
    """
    Assemble a virtual HDF5 file that concatenates one dataset of several files.

    Args:
        filenames: Paths of the source files.
        name: Name of the dataset to read from every source; also the name of
            the virtual dataset that is created.

    Returns:
        The open ``h5py.File`` (mode ``"w"``, created in the current working
        directory under a random ``<uuid4>.h5`` name) holding the virtual
        dataset.  The caller is responsible for closing it.

    Raises:
        ValueError: If ``filenames`` is empty.
    """
    filenames = list(filenames)
    if not filenames:
        raise ValueError("h5_virtual_file() needs at least one source file")

    vsources = []
    total_t = 0
    for path in filenames:
        # Only shape and dtype are needed; close each source right away.
        with h5py.File(path, "r") as source_file:
            data = source_file[name]
            t, *features_shape = data.shape
            dtype = data.dtype
        total_t += t
        vsources.append(h5py.VirtualSource(path, name, shape=(t, *features_shape)))

    # Assemble virtual dataset; trailing shape and dtype come from the last
    # source file.
    layout = h5py.VirtualLayout(shape=(total_t, *features_shape), dtype=dtype)
    cursor = 0
    for vsource in vsources:
        n_rows = vsource.shape[0]
        layout[cursor:cursor + n_rows] = vsource
        cursor += n_rows

    # Add virtual dataset to output file
    f = h5py.File(f"{uuid.uuid4()}.h5", "w", libver="latest")
    try:
        f.create_virtual_dataset(name, layout)
    except Exception:
        # Do not leave the handle open when nothing is returned to the caller.
        f.close()
        raise
    return f
def test_shape_calculation_positive_step(self):
    """Sliced VirtualSource length matches numpy for ``2:12+i:3``, i in 0..4."""
    source = h5.VirtualSource('test', 'test', (20, ))
    n_matching = 0
    for extra in range(5):
        window = slice(2, 12 + extra, 3)
        expected = np.arange(20)[window].size
        if expected == source[window].shape[0]:
            n_matching += 1
    self.assertEqual(5, n_matching)
def test_double_strided_range(self):
    """Strided slices on two axes of a VirtualSource give shape (3, 30, 2)."""
    source = h5.VirtualSource('test', 'test', (20, 30, 30))
    expected_shape = (3, 30, 2)
    self.assertEqual(expected_shape, source[6:12:2, :, 20:26:3].shape)
def test_shape_calculation_positive_step_switched_start_stop(self):
    """Sliced VirtualSource length matches numpy when start > stop.

    The slices ``12+i:2:3`` (i in 0..4) have a positive step but a start
    beyond the stop; the resulting length is compared against numpy's.
    """
    dataset = h5.VirtualSource('test', 'test', (20, ))
    cmp = []
    for i in range(5):
        d = dataset[12 + i:2:3].shape[0]
        ref = np.arange(20)[12 + i:2:3].size
        # A single formatted argument prints the same on Python 2 and 3.
        print("%s %s" % (d, ref))
        cmp.append(ref == d)
    self.assertEqual(5, sum(cmp))
def split(input_h5, output_h5):
    """Read the data file, create N_FAST * N_SLOW new data sets, then copy
    the data from the former into the latter and build a VDS.

    One compressed file per entry of ``CHUNKMAP`` is written next to
    ``output_h5`` (``.h5`` replaced by ``_NN.h5``); ``output_h5`` itself
    receives a virtual dataset ``data`` that places the modules of those
    files at their position in the full image, with ``-1`` as fill value.
    All output files are created with mode ``"x"``, i.e. they must not exist.
    """
    with h5py.File(input_h5, "r") as fin:
        frames, slow, fast = fin["data"].shape

        open_files = []
        chunk_filenames = []
        output_dsets = []
        try:
            for n in range(len(CHUNKMAP)):
                filename = output_h5.replace(".h5", "_%02d.h5" % n)
                fout = h5py.File(filename, "x")
                # Track the handle first so it is closed even if creating
                # the dataset fails.
                open_files.append(fout)
                chunk_filenames.append(filename)
                # in here I am chunking as 4-module chunks but _maybe_ we
                # should consider chunking as 1-module chunks and having 4
                # chunks per "image"
                dset = fout.create_dataset(
                    "data",
                    (frames, 4 * MOD_SLOW, MOD_FAST),
                    chunks=(1, 4 * MOD_SLOW, MOD_FAST),
                    compression=bitshuffle.h5.H5FILTER,
                    compression_opts=(0, bitshuffle.h5.H5_COMPRESS_LZ4),
                    dtype=fin["data"].dtype,
                )
                output_dsets.append(dset)

            blit(fin["data"], output_dsets)
        finally:
            for fout in open_files:
                fout.close()

        # create VDS
        layout = h5py.VirtualLayout(shape=(frames, slow, fast), dtype="i4")
        for i, chunk in enumerate(CHUNKMAP):
            source = h5py.VirtualSource(chunk_filenames[i],
                                        "data",
                                        shape=(frames, 4 * MOD_SLOW, MOD_FAST))
            for k, n in enumerate(chunk):
                # Module n sits at row s, column f of the module grid; module
                # k of the chunk file is placed at that position.
                s, f = divmod(n, N_FAST)
                f0 = f * (MOD_FAST + GAP_FAST)
                f1 = f0 + MOD_FAST
                s0 = s * (MOD_SLOW + GAP_SLOW)
                s1 = s0 + MOD_SLOW
                layout[:, s0:s1, f0:f1] = source[:, k * MOD_SLOW:(k + 1) *
                                                 MOD_SLOW, :]

        # The context manager closes the master file, which used to stay open.
        with h5py.File(output_h5, "x") as fout:
            data = fout.create_virtual_dataset("data", layout, fillvalue=-1)
            for k in "image_nr_low", "image_nr_high":
                data.attrs.create(k, fin["data"].attrs.get(k), dtype="i4")
def preallocate_output(self, out, parallel_store=False):
    """
    Storage allocation and provisioning

    Parameters
    ----------
    out : syncopy data object
        Empty object for holding results
    parallel_store : bool
        If `True`, a directory for virtual source files is created in
        Syncopy's temporary on-disk storage (defined by
        `syncopy.__storage__`) and a virtual layout referencing one HDF5
        file per chunk is prepared. Otherwise, a dataset of appropriate type
        and shape is allocated in a new regular HDF5 file at `out.filename`.

    Returns
    -------
    Nothing : None

    See also
    --------
    compute : management routine controlling memory pre-allocation
    """
    # Sequential writing: allocate one regular HDF5 dataset
    if not parallel_store:
        # The shape of the target depends on trial-averaging
        shp = self.outputShape if self.keeptrials else self.cfg["chunkShape"]
        with h5py.File(out.filename, mode="w") as h5f:
            h5f.create_dataset(name=self.outDatasetName,
                               dtype=self.dtype,
                               shape=shp)
        self.outFileName = out.filename
        self.tmpDsetName = self.outDatasetName
        return

    # Parallel writing via VDS storage: prepare the directory for by-chunk
    # HDF5 files and construct the virtual HDF layout
    vds_stem, _ = os.path.splitext(os.path.basename(out.filename))
    self.virtualDatasetDir = os.path.join(__storage__, vds_stem)
    os.mkdir(self.virtualDatasetDir)

    layout = h5py.VirtualLayout(shape=self.outputShape, dtype=self.dtype)
    for k, idx in enumerate(self.targetLayout):
        fname = os.path.join(self.virtualDatasetDir, "{0:d}.h5".format(k))
        # Catch empty selections: don't map empty sources into the layout
        if all(self.sourceLayout[k]):
            layout[idx] = h5py.VirtualSource(fname,
                                             self.virtualDatasetNames,
                                             shape=self.targetShapes[k])
    self.VirtualDatasetLayout = layout
    # File name template; "{0:d}" is left unformatted here
    self.outFileName = os.path.join(self.virtualDatasetDir, "{0:d}.h5")
    self.tmpDsetName = self.virtualDatasetNames
def concatenate(file_names_to_concatenate, entry_key='data', output_file="VDS.h5"):
    """Stack one dataset of several HDF5 files into a virtual dataset.

    The virtual dataset has a new leading axis with one entry per source
    file, dtype ``float64`` and fill value ``0``.

    Args:
        file_names_to_concatenate: Paths of the source files.  The shape of
            the first one is assumed for all of them.
        entry_key: Where the data is inside of the source files; also the
            name of the virtual dataset.
        output_file: Path of the HDF5 file to create (mode ``'w'``).
    """
    # Get the first one's shape, closing the file again afterwards.
    with h5py.File(file_names_to_concatenate[0], 'r') as first_file:
        sh = first_file[entry_key].shape

    layout = h5py.VirtualLayout(shape=(len(file_names_to_concatenate), ) + sh,
                                dtype=np.float64)
    for i, filename in enumerate(file_names_to_concatenate):
        # Indexing only the first axis works for sources of any rank.
        layout[i] = h5py.VirtualSource(filename, entry_key, shape=sh)

    with h5py.File(output_file, 'w', libver='latest') as f:
        f.create_virtual_dataset(entry_key, layout, fillvalue=0)
def tile_h5datasets(dest, name, sources, shape_map, tile_shape, nscandim=1):
    """Merge datasets in a virtual dataset.

    The first ``nscandim`` dimensions of every source are treated as scan
    dimensions and the remaining ones as detector dimensions.  Scan shapes
    may be replaced through ``shape_map``; the position of each source in the
    layout is computed by ``tile_indices``.  The virtual dataset takes dtype
    and fill value from the first source.

    :param h5py.Group dest: group in which the virtual dataset is created
    :param str name: name of the virtual dataset
    :param list(h5py.Dataset) sources: datasets to merge
    :param dict shape_map: maps a scan shape to the shape it should be
        reshaped to; shapes without an entry are kept
    :param tile_shape: passed (reversed) to ``tile_indices``
    :param int nscandim: start index of the data dimensions
    """
    dset_shapes = [dset.shape for dset in sources]
    scan_shapes = [dset_shape[:nscandim] for dset_shape in dset_shapes]  # F-order
    det_shapes = [dset_shape[nscandim:] for dset_shape in dset_shapes]
    reshaped_scan_shapes = [
        shape_map.get(scan_shape, scan_shape) for scan_shape in scan_shapes
    ]  # F-order
    reshaped_scan_shapes = [s[::-1] for s in reshaped_scan_shapes]  # C-order
    # match_shapes is defined elsewhere; it receives each scan shape together
    # with the re-reversed reshaped shape and returns a pair per source.
    reduced_scan_shapes, reshaped_scan_shapes = zip(
        *(match_shapes([shape1, shape2[::-1]])
          for shape1, shape2 in zip(scan_shapes, reshaped_scan_shapes)))
    reshaped_scan_shapes = [s[::-1] for s in reshaped_scan_shapes]  # C-order
    tile_shape = tile_shape[::-1]  # C-order
    layout_scan_shape, indices = tile_indices(tile_shape,
                                              reshaped_scan_shapes,
                                              order="C")
    # Scan dimensions of the layout followed by max_shape(det_shapes)
    layout_shape = layout_scan_shape + max_shape(det_shapes)

    dtype = sources[0].dtype
    fillvalue = sources[0].fillvalue
    layout = h5py.VirtualLayout(shape=layout_shape, dtype=dtype)
    for layout_idx, dset, reduced_scan_shape, det_shape in zip(
            indices, sources, reduced_scan_shapes, det_shapes):
        vsource = h5py.VirtualSource(
            dset.file.filename,
            dset.name,
            shape=dset.shape,
            dtype=dset.dtype,
        )
        reduced_source_shape = reduced_scan_shape + det_shape
        det_idx = tuple(slice(0, n) for n in det_shape)
        if reduced_source_shape != vsource.shape:
            # Select only the reduced part of the source.
            # NOTE(review): reduced_source_shape already contains det_shape,
            # so appending det_idx adds the detector slices a second time -
            # confirm this is intended.
            vsource_idx = tuple(slice(0, n) for n in reduced_source_shape)
            vsource_idx += det_idx
            vsource = vsource[vsource_idx]
        # Restrict the layout selection to this source's detector extent
        layout_idx += det_idx
        layout[layout_idx] = vsource
    dest.create_virtual_dataset(name, layout, fillvalue=fillvalue)
def test_check_file(tmp_path):
    """check_file reports 3 for a VDS with three inaccessible sources."""
    vds_path = str(tmp_path / 'test.h5')
    unreadable = tmp_path / 'noaccess.h5'
    unreadable.touch()
    unreadable.chmod(0)

    # (source file, source dataset) for layout rows 0..3
    mappings = (
        ('test.h5', 'exists'),  # 0: valid, accessible mapping
        ('test.h5', 'nonexists'),  # 1: file exists, but dataset doesn't
        ('testnothere.h5', 'nonexists'),  # 2: file doesn't exist
        ('noaccess.h5', 'blah'),  # 3: file exists, but no read permission
    )

    with h5py.File(vds_path, 'w') as f:
        f['exists'] = np.arange(10, dtype=np.float32)
        layout = h5py.VirtualLayout((10, 10), np.float32)
        for row, (source_file, source_dset) in enumerate(mappings):
            layout[row] = h5py.VirtualSource(source_file, source_dset, (10, ))
        f.create_virtual_dataset('vds', layout)

    assert hdf5_vds_check.check_file(vds_path) == 3  # 3 inaccessible sources
def test_excalibur_high_level(self):
    """Stack the source files vertically, separated by a 10 pixel gap.

    Each file in ``self.fname`` provides one horizontal stripe of the output
    frames; the gaps between stripes are left to the fill value.
    """
    self.outfile = self.working_dir + 'excalibur.h5'
    in_key = 'data'  # where is the data at the input?
    # Read shape and datatype in one go and close the input again.
    with h5.File(self.fname[0], 'r') as first_file:
        in_sh = first_file[in_key].shape  # get the input shape
        dtype = first_file[in_key].dtype  # get the datatype

    # now generate the output shape
    vertical_gap = 10  # pixels spacing in the vertical
    nfiles = len(self.fname)
    print("nfiles is:" + str(nfiles))
    nframes = in_sh[0]
    width = in_sh[2]
    height = (in_sh[1] * nfiles) + (vertical_gap * (nfiles - 1))
    out_sh = (nframes, height, width)
    print("%s %s" % (out_sh, in_sh))

    # create an output file.
    with h5.File(self.outfile, 'w', libver='latest') as f:
        # Virtual target is a representation of the output dataset
        TGT = h5.VirtualTarget(self.outfile, 'data', shape=out_sh)
        offset = 0  # initial offset
        # The whole difference is the print argument; as a Python 2 print
        # statement this printed the stripe height.
        print((offset + in_sh[1]) - offset)
        VMlist = []  # place to put the maps
        for i in range(nfiles):
            # for feedback
            print("frame_number is: %s, offset is:%s" % (str(i), offset))
            # a representation of the input dataset
            VSRC = h5.VirtualSource(self.fname[i], in_key, shape=in_sh)
            # map them with indexing
            VM = h5.VirtualMap(VSRC,
                               TGT[:, offset:(offset + in_sh[1]), :],
                               dtype=dtype)
            offset += in_sh[1] + vertical_gap  # increment the offset
            VMlist.append(VM)  # append it to the list
        # pass the fill value and list of maps
        f.create_virtual_dataset(VMlist=VMlist, fillvalue=0x1)

    # The file is closed even when one of the assertions fails.
    with h5.File(self.outfile, 'r') as f:
        data = f['data']
        self.assertEqual(data[3, 100, 0], 0.0)
        self.assertEqual(data[3, 260, 0], 1.0)
        self.assertEqual(data[3, 350, 0], 3.0)
        self.assertEqual(data[3, 650, 0], 6.0)
        self.assertEqual(data[3, 900, 0], 9.0)
        self.assertEqual(data[3, 1150, 0], 12.0)
        self.assertEqual(data[3, 1450, 0], 15.0)
def setUp(self):
    """Create two source files and a virtual dataset combining them.

    ``testfile1.h5`` and ``testfile2.h5`` each get a float32 dataset
    ``'data'`` of ten values (``0..9`` and ``0..-9``).  ``testfile2.h5``
    additionally gets the 2x10 virtual dataset ``'virtual'`` whose rows map
    the ``'data'`` datasets of the first and the second file.
    """
    self.tmpdir = tempfile.mkdtemp()
    self.f1 = osp.join(self.tmpdir, 'testfile1.h5')
    self.f2 = osp.join(self.tmpdir, 'testfile2.h5')

    self.data1 = np.arange(10)
    self.data2 = np.arange(10) * -1

    with h5.File(self.f1, 'w') as first:
        # dataset
        first_data = first.create_dataset('data', (10, ), 'f4')
        first_data[:] = self.data1

    with h5.File(self.f2, 'w') as second:
        # dataset
        second_data = second.create_dataset('data', (10, ), 'f4')
        second_data[:] = self.data2

        # virtual dataset
        vds_layout = h5.VirtualLayout((2, 10), 'f4')
        for row, source_file in enumerate((self.f1, self.f2)):
            vds_layout[row] = h5.VirtualSource(source_file, 'data', shape=(10, ))
        second.create_virtual_dataset('virtual', vds_layout)
def setUp(self):
    """Create a file with a resizable source and a virtual view of it.

    ``resize.h5`` gets the dataset ``'source'`` (``np.arange(20)`` as
    ``(10, 2)``, unlimited along the first axis) and the virtual dataset
    ``'virtual'`` whose single column is mapped, with unlimited selections
    on both sides, to column 1 of the source.
    """
    self.tmpdir = tempfile.mkdtemp()
    self.path = osp.join(self.tmpdir, "resize.h5")
    with h5.File(self.path, "w") as f:
        source_dset = f.create_dataset("source",
                                       data=np.arange(20),
                                       shape=(10, 2),
                                       maxshape=(None, 2),
                                       chunks=(10, 1),
                                       fillvalue=-1)
        # builtin int is what the removed alias np.int referred to
        self.layout = h5.VirtualLayout((10, 1), int, maxshape=(None, 1))
        layout_source = h5.VirtualSource(source_dset)
        self.layout[:h5.UNLIMITED, 0] = layout_source[:h5.UNLIMITED, 1]
        f.create_virtual_dataset("virtual", self.layout)