Example No. 1
def virtual_layouts(num_events: int, num_cues: int, dtypes: Dict[str, type]) -> Layouts:
    """Create a dictionary of data set names and corresponding HDF5 virtual layouts."""
    layouts = {}
    for key in event_keys:
        layouts[key] = h5py.VirtualLayout(shape=(num_events,), dtype=dtypes[key])
    for key in cue_keys:
        layouts[key] = h5py.VirtualLayout(shape=(num_cues,), dtype=dtypes[key])
    return layouts
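The helper assumes module-level `event_keys` and `cue_keys` lists. A minimal usage sketch under that assumption (the key names, source filename, and dtypes below are hypothetical):

import h5py
import numpy as np

event_keys = ["event_id", "event_time"]  # assumed globals, illustrative names
cue_keys = ["cue_id"]

layouts = virtual_layouts(num_events=1000, num_cues=10,
                          dtypes={k: np.int64 for k in event_keys + cue_keys})
# Map one hypothetical source file over the full extent of each layout,
# then bind every layout into the output file as a virtual dataset.
for key in event_keys:
    layouts[key][:] = h5py.VirtualSource("events_000.h5", key, shape=(1000,))
for key in cue_keys:
    layouts[key][:] = h5py.VirtualSource("events_000.h5", key, shape=(10,))
with h5py.File("events_vds.h5", "w", libver="latest") as f:
    for name, layout in layouts.items():
        f.create_virtual_dataset(name, layout)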
Example No. 2
def combine_h5(h5dir, out_h5file):
    filelist = list(pathlib.Path(h5dir).glob('*.h5'))
    unicode = h5py.special_dtype(vlen=str)
    n_files = len(filelist)

    # Get total no. of utts (spks) and no. of frames in .h5 files in the folder
    n_utts = list()
    n_frames = list()
    for i in range(n_files):
        with h5py.File(filelist[i], 'r') as f:
            n_utts.append(len(f['utt_ids']))
            n_frames.append(f['mfcc'].shape[0])
            mfcc_dim = f['mfcc'].shape[1]
    tot_n_utts = np.sum(n_utts)
    tot_n_frames = np.sum(n_frames)
    print(f"Total no. of utts = {tot_n_utts}")
    print(f"Total no. of frames = {tot_n_frames}")
    print(f"MFCC dim = {mfcc_dim}")

    # Assemble virtual dataset
    utt_layout = h5py.VirtualLayout(shape=(tot_n_utts, ), dtype=unicode)
    spk_layout = h5py.VirtualLayout(shape=(tot_n_utts, ), dtype=unicode)
    pos_layout = h5py.VirtualLayout(shape=(tot_n_utts, 2), dtype="int64")
    mfc_layout = h5py.VirtualLayout(shape=(tot_n_frames, mfcc_dim),
                                    dtype="float32")
    k1 = 0
    k2 = 0
    for i in range(n_files):
        print(f"Reading {filelist[i]}")
        range1 = range(k1, k1 + n_utts[i])
        range2 = range(k2, k2 + n_frames[i])
        print(f"spk_ids: {range1}")
        print(f"mfcc: {range2}")
        utt_layout[range1] = h5py.VirtualSource(filelist[i],
                                                "utt_ids",
                                                shape=(n_utts[i], ))
        spk_layout[range1] = h5py.VirtualSource(filelist[i],
                                                "spk_ids",
                                                shape=(n_utts[i], ))
        pos_layout[range1] = h5py.VirtualSource(filelist[i],
                                                "positions",
                                                shape=(n_utts[i], 2))
        mfc_layout[range2] = h5py.VirtualSource(filelist[i],
                                                "mfcc",
                                                shape=(n_frames[i], mfcc_dim))
        k1 = k1 + n_utts[i]
        k2 = k2 + n_frames[i]

    # Add virtual dataset to output file
    with h5py.File(out_h5file, "w", libver="latest") as f:
        print(f"Writing combined file {out_h5file}")
        f.create_virtual_dataset("utt_ids", utt_layout, fillvalue=None)
        f.create_virtual_dataset("spk_ids", spk_layout, fillvalue=None)
        f.create_virtual_dataset("positions", pos_layout, fillvalue=None)
        f.create_virtual_dataset("mfcc", mfc_layout, fillvalue=None)
Example No. 3
    def test_mismatched_selections(self):
        layout = h5.VirtualLayout((4, 100), 'i4', maxshape=(4, None))

        filename = osp.join(self.tmpdir, "1.h5")
        vsource = h5.VirtualSource(filename, 'data', shape=(100, ))
        with self.assertRaisesRegex(ValueError, r'different number'):
            layout[0, :49] = vsource[0:100:2]
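The mapping is rejected because vsource[0:100:2] selects 50 points while layout[0, :49] provides only 49, so h5py raises a ValueError noting that the two selections have a different number of points.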
Example No. 4
def h5_virtual_file(filenames, name="data"):
    """
    Assemble a virtual h5 file from multiple source files.
    """
    vsources = []
    total_t = 0
    for path in filenames:
        # Open read-only just long enough to record shape and dtype,
        # rather than leaving a handle open per source file.
        with h5py.File(path, "r") as f:
            data = f[name]
            t, *features_shape = data.shape
            dtype = data.dtype
        total_t += t
        vsources.append(h5py.VirtualSource(path, name, shape=(t, *features_shape)))

    # Assemble virtual dataset (assumes all files share feature shape and dtype)
    layout = h5py.VirtualLayout(shape=(total_t, *features_shape), dtype=dtype)
    cursor = 0
    for vsource in vsources:
        # we generate slices like layout[0:10, :, :, :]
        indices = (slice(cursor, cursor + vsource.shape[0]),) + (slice(None),) * (
            len(vsource.shape) - 1
        )
        layout[indices] = vsource
        cursor += vsource.shape[0]
    # Add virtual dataset to output file
    f = h5py.File(f"{uuid.uuid4()}.h5", "w", libver="latest")
    f.create_virtual_dataset(name, layout)
    return f
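Note that the function returns an open h5py.File handle, so the caller owns it and must close it. A usage sketch with hypothetical input files:

vf = h5_virtual_file(["part0.h5", "part1.h5"])
try:
    print(vf["data"].shape)  # (total_t, *features_shape)
finally:
    vf.close()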
Example No. 5
def merge(output, h5s):
    dfs = []  # defined up front so the finally clause cannot hit a NameError
    try:
        dfs = [h5py.File(h5, "r") for h5 in h5s]

        im_key = list(dfs[0].keys())[0] + "/images"
        im_shape = dfs[0][im_key].shape[1:]
        merged_shape = [0] + list(im_shape)
        for df in dfs:
            assert df[im_key].shape[
                1:] == im_shape, "Image shape in %s (%s) does not equal %s" % (
                    df.filename, str(df[im_key].shape[1:]), str(im_shape))
            merged_shape[0] += df[im_key].shape[0]

        merged_shape = tuple(merged_shape)

        with h5py.File(output, "w") as merged_df:
            for changrp in dfs[0].keys():
                mergedgrp = merged_df.create_group(changrp)
                for key in dfs[0][changrp].keys():
                    layout = h5py.VirtualLayout(
                        shape=merged_shape, dtype=dfs[0][changrp][key].dtype)
                    vsources = []
                    i = 0
                    for df in dfs:
                        vsources.append(
                            h5py.VirtualSource(df[changrp + "/" + key]))
                        layout[i:i + vsources[-1].shape[0]] = vsources[-1]

                        i += vsources[-1].shape[0]

                    mergedgrp.create_virtual_dataset(key, layout)

    finally:
        for df in dfs:
            df.close()
Example No. 6
    def _assemble_data(self, source, key):
        """Assemble chunks of data into a virtual layout"""
        # First, get a list of all non-empty data chunks
        chunks = [
            c for c in self.data._find_data_chunks(source, key)
            if (c.counts > 0).any()
        ]
        chunks.sort(key=lambda c: c.train_ids[0])
        if not chunks:
            return None, None

        # Create the layout, which will describe what data is where
        n_total = np.sum([c.counts.sum() for c in chunks])
        ds0 = chunks[0].dataset
        layout = h5py.VirtualLayout(shape=(n_total, ) + ds0.shape[1:],
                                    dtype=ds0.dtype)

        # Map each chunk into the relevant part of the layout
        output_cursor = np.uint64(0)
        for chunk in chunks:
            n = chunk.counts.sum()
            src = h5py.VirtualSource(chunk.dataset)
            src = src[chunk.slice]
            layout[output_cursor:output_cursor + n] = src
            output_cursor += n

        assert output_cursor == n_total

        # Make an array of which train ID each data entry is for:
        train_ids = np.concatenate(
            [np.repeat(c.train_ids, c.counts.astype(np.intp)) for c in chunks])
        return layout, train_ids
Example No. 7
def create_virtual_data(file_pattern, x, entry_key, save_to):
    files = [file_pattern % el for el in x]
    files = [(el, f) for f, el in zip(files, x) if os.path.exists(f)]
    # entry_key = '/ref/power/008'
    # save_to = "/Users/beauchamplab/rave_data/data_dir/congruency/YAB/rave/data/power/virtual.h5"

    if len(files) == 0:
        print('No valid files found')
        return False

    # get file shape
    with h5py.File(files[0][1], 'r') as sample_f:
        sh = sample_f[entry_key].shape
        dtype = sample_f[entry_key].dtype

    layout = h5py.VirtualLayout(shape=(len(files), ) + sh, dtype=dtype)
    for i, file_dup in enumerate(files):
        filename = file_dup[1]
        el = file_dup[0]
        print(filename)
        vsource = h5py.VirtualSource(filename, entry_key, shape=sh)
        layout[i, :, :] = vsource

    with h5py.File(save_to, 'w', libver='latest') as f:
        f.create_virtual_dataset(entry_key, layout, fillvalue=np.nan)

    return True
Example No. 8
def writesino(h5name, omegas, dtys, filenames):
    offset, size, shape, dtype = binary_info( filenames[0][0] )
    print(offset,size,shape,dtype)
    nframes = len( omegas[0] ) * len( omegas )
    print(nframes, len(omegas), sum(len(o) for o in omegas))
    # Now create a hdf5 file:
    with h5py.File(h5name, "w", libver='latest' ) as h:
        # now create a VDS linking within the same file
        layout = h5py.VirtualLayout( shape = (nframes, shape[0], shape[1] ),
                                     dtype = dtype )
        j = 0
        graw = h.require_group('scans')
        for i, scan in enumerate(filenames):
            g = graw.require_group('scan%04d'%(i))
            g.create_dataset( "data",
                              shape = (len(scan), shape[0], shape[1]),
                              dtype = dtype,
                              external = [(fname, offset, size) for fname in scan] )
            g.create_dataset( "omega" , data = omegas[i] )
            g.create_dataset( "dty" , data = dtys[i] )
            vsource = h5py.VirtualSource( h.filename, # ok - circular?
                                          'scans/scan%04d/data'%(i),
                                          shape = (len(scan), shape[0], shape[1]) )
            layout[ j:j+len(scan), :, :] = vsource
            j += len(scan)
        g = h.require_group('sinogram')
        g.create_dataset('omega', data = np.concatenate(omegas) )
        g.create_dataset('dty', data = np.concatenate(dtys) )
        g.create_virtual_dataset( 'data', layout )
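The "ok - circular?" comment is nothing to worry about: HDF5 explicitly permits a virtual dataset to map sources stored in its own file, and the special source file name "." can be used to mean "the file containing the VDS", which avoids hard-coding h.filename.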
Example No. 9
def save_epix(out_file, descriptor, trains, shape, epix_id):
    """
    Save EPIX data to a VDS HDF5 file

    out_file - HDF5 file
    descriptor - list of data files to save
    trains - train IDs to save
    shape - EPIX data shape
    epix_id - EPIX detector number
    """
    layout = h5py.VirtualLayout(shape=(trains.size, ) + shape, dtype=np.uint16)
    counter = 0
    for file_name in descriptor:
        print('Opening file: {}'.format(os.path.basename(file_name)))
        with h5py.File(file_name, 'r') as data_file:
            file_trains = data_file[config.EPIX_TRAIN_KEY][:]
            file_data = data_file[config.EPIX_KEY.format(epix_id)]
            file_idxs = np.concatenate(
                [np.where(train_id == file_trains)[0] for train_id in trains])
            chunk_size = file_data.chunks[0]
            num_chunks = int(np.ceil(file_idxs.size / chunk_size))
            for chunk in range(num_chunks):
                start, end = chunk * chunk_size, min(file_data.shape[0],
                                                     (chunk + 1) * chunk_size)
                data = h5py.VirtualSource(file_data)[
                    file_idxs[start:end], :, :]
                layout[counter:counter + file_idxs[start:end].size] = data
                counter += file_idxs[start:end].size
        print('File {0} saved, data size: {1:d}\n'.format(
            os.path.basename(file_name), counter))
    out_file.create_virtual_dataset(config.EPIX_DATA_KEY.format(epix_id),
                                    layout)
Example No. 10
    def __init__(self, filenames, default_streams=None):
        super(H5DatasetLoader, self).__init__()
        self.filenames = filenames
        if isinstance(self.filenames, list):
            self._h5_tempfile = tempfile.NamedTemporaryFile()
            #self.h5_file = h5py.File(self._h5_tempfile, 'w', libver='latest')

            self._allfiles, _allstreams, _lengths = zip(*[H5DatasetLoader.load_single_h5(f) for f in self.filenames])

            total_len = sum(_lengths)

            # create virtual datasets; assumes all files share the streams and shapes of the first file
            ll = (0,) + _lengths
            ll = np.cumsum(ll)
            for s in _allstreams[0]:
                shape = (total_len, ) + self._allfiles[0][s].shape[1:]
                layout = h5py.VirtualLayout(shape=shape, dtype=self._allfiles[0][s].dtype)

                for idx, f in enumerate(self._allfiles):
                    vsource = h5py.VirtualSource(f[s])
                    layout[ll[idx]:ll[idx+1]] = vsource

                with h5py.File(self._h5_tempfile.name, 'a', libver='latest') as f:
                    f.create_virtual_dataset(s, layout,)
            self._h5_tempfile.flush()
            self.h5_file = H5DatasetLoader.load_single_h5(self._h5_tempfile.name)[0]
        else:
            self.h5_file = H5DatasetLoader.load_single_h5(self.filenames)[0]
        self.streams_available = list(self.h5_file.keys())
        self.default_streams = default_streams

        if default_streams is not None:
            for s in default_streams:
                assert s in self.streams_available, f"{s} not found in available streams"
Example No. 11
    def test_percival_high_level(self):
        outfile = osp.join(self.working_dir, 'percival.h5')

        # Virtual layout is a representation of the output dataset
        layout = h5.VirtualLayout(shape=(79, 200, 200), dtype=np.float64)
        for k, filename in enumerate(self.fname):
            dim1 = 19 if k == 3 else 20
            vsource = h5.VirtualSource(filename,
                                       'data',
                                       shape=(dim1, 200, 200))
            layout[k:79:4, :, :] = vsource[:, :, :]

        # Create the virtual dataset file
        with h5.File(outfile, 'w', libver='latest') as f:
            f.create_virtual_dataset('data', layout, fillvalue=-5)

        foo = np.array(2 * list(range(4)))
        with h5.File(outfile, 'r') as f:
            ds = f['data']
            line = ds[:8, 100, 100]
            self.assertEqual(
                ds.shape,
                (79, 200, 200),
            )
            assert_array_equal(line, foo)
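The strided mapping layout[k:79:4, :, :] interleaves the four sources frame by frame (source k supplies frames k, k+4, k+8, ...), so with each source file evidently filled with its own index at this pixel, the first eight frames read back as 0, 1, 2, 3, 0, 1, 2, 3.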
Example No. 12
    def make_vds(self, f):
        # virtual dataset
        layout = h5.VirtualLayout((2, 10), 'f4')
        vsource1 = h5.VirtualSource(self.f1, 'data', shape=(10, ))
        vsource2 = h5.VirtualSource(self.f2, 'data', shape=(10, ))
        layout[0] = vsource1
        layout[1] = vsource2
        f.create_virtual_dataset('virtual', layout)
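A self-contained sketch of the same two-file pattern, assuming self.f1/self.f2 correspond to files like the hypothetical f1.h5/f2.h5 below, each holding a 1-D 'data' dataset of length 10:

import h5py as h5
import numpy as np

# Build two small source files.
for fname, base in [("f1.h5", 0.0), ("f2.h5", 100.0)]:
    with h5.File(fname, "w") as f:
        f.create_dataset("data", data=np.arange(base, base + 10, dtype="f4"))

# Stack them as the two rows of a (2, 10) virtual dataset.
layout = h5.VirtualLayout((2, 10), "f4")
layout[0] = h5.VirtualSource("f1.h5", "data", shape=(10,))
layout[1] = h5.VirtualSource("f2.h5", "data", shape=(10,))
with h5.File("vds.h5", "w", libver="latest") as f:
    f.create_virtual_dataset("virtual", layout)
    print(f["virtual"][:])  # row 0: 0..9, row 1: 100..109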
Example No. 13
    def build_virtual_layout(sources, source_shapes, dtype):
        virtual_layout = h5py.VirtualLayout(shape=source_shapes, dtype=dtype)
        offset = 0
        for source in sources:
            length = source.shape[0]
            virtual_layout[offset:offset + length] = source
            offset += length

        return virtual_layout
Example No. 14
def split(input_h5, output_h5):
    """Read the data file, create N_FAST * N_SLOW new data sets, then
    copy the data from the former into the latter and build a VDS"""

    with h5py.File(input_h5, "r") as fin:
        frames, slow, fast = fin["data"].shape

        output_files = []
        output_dsets = []
        for n in range(len(CHUNKMAP)):
            filename = output_h5.replace(".h5", "_%02d.h5" % n)
            fout = h5py.File(filename, "x")

            # in here I am chunking as 4-module chunks but _maybe_ we should
            # consider chunking as 1-module chunks and having 4 chunks per
            # "image" -> :thinking_face:

            dset = fout.create_dataset(
                "data",
                (frames, 4 * MOD_SLOW, MOD_FAST),
                chunks=(1, 4 * MOD_SLOW, MOD_FAST),
                compression=bitshuffle.h5.H5FILTER,
                compression_opts=(0, bitshuffle.h5.H5_COMPRESS_LZ4),
                dtype=fin["data"].dtype,
            )

            output_files.append((fout, filename))
            output_dsets.append(dset)

        blit(fin["data"], output_dsets)

        for fout in output_files:
            fout[0].close()

        # create VDS
        layout = h5py.VirtualLayout(shape=(frames, slow, fast), dtype="i4")

        for i, chunk in enumerate(CHUNKMAP):
            source = h5py.VirtualSource(output_files[i][1],
                                        "data",
                                        shape=(frames, 4 * MOD_SLOW, MOD_FAST))
            for k, n in enumerate(chunk):
                s, f = divmod(n, N_FAST)
                f0 = f * (MOD_FAST + GAP_FAST)
                f1 = f0 + MOD_FAST
                s0 = s * (MOD_SLOW + GAP_SLOW)
                s1 = s0 + MOD_SLOW
                layout[:, s0:s1,
                       f0:f1] = source[:, k * MOD_SLOW:(k + 1) * MOD_SLOW, :]

        fout = h5py.File(output_h5, "x")
        data = fout.create_virtual_dataset("data", layout, fillvalue=-1)
        for k in "image_nr_low", "image_nr_high":
            data.attrs.create(k, fin["data"].attrs.get(k), dtype="i4")
        fout.close()
Example No. 15
    def preallocate_output(self, out, parallel_store=False):
        """
        Storage allocation and provisioning

        Parameters
        ----------
        out : syncopy data object
           Empty object for holding results
        parallel_store : bool
           If `True`, a directory for virtual source files is created
           in Syncopy's temporary on-disk storage (defined by `syncopy.__storage__`).
           Otherwise, a dataset of appropriate type and shape is allocated
           in a new regular HDF5 file created inside Syncopy's temporary
           storage folder.

        Returns
        -------
        Nothing : None

        See also
        --------
        compute : management routine controlling memory pre-allocation
        """

        # In case parallel writing via VDS storage is requested, prepare
        # directory for by-chunk HDF5 files and construct virtual HDF layout
        if parallel_store:
            vdsdir = os.path.splitext(os.path.basename(out.filename))[0]
            self.virtualDatasetDir = os.path.join(__storage__, vdsdir)
            os.mkdir(self.virtualDatasetDir)

            layout = h5py.VirtualLayout(shape=self.outputShape, dtype=self.dtype)
            for k, idx in enumerate(self.targetLayout):
                fname = os.path.join(self.virtualDatasetDir, "{0:d}.h5".format(k))
                # Catch empty selections: don't map empty sources into the layout of the VDS
                if all([sel for sel in self.sourceLayout[k]]):
                    layout[idx] = h5py.VirtualSource(fname, self.virtualDatasetNames, shape=self.targetShapes[k])
            self.VirtualDatasetLayout = layout
            self.outFileName = os.path.join(self.virtualDatasetDir, "{0:d}.h5")
            self.tmpDsetName = self.virtualDatasetNames

        # Create regular HDF5 dataset for sequential writing
        else:

            # The shape of the target depends on trial-averaging
            if not self.keeptrials:
                shp = self.cfg["chunkShape"]
            else:
                shp = self.outputShape
            with h5py.File(out.filename, mode="w") as h5f:
                h5f.create_dataset(name=self.outDatasetName,
                                   dtype=self.dtype, shape=shp)
            self.outFileName = out.filename
            self.tmpDsetName = self.outDatasetName
Example No. 16
def concatenate(file_names_to_concatenate):
    entry_key = 'data'  # where the data is inside of the source files.
    with h5py.File(file_names_to_concatenate[0], 'r') as f:
        sh = f[entry_key].shape  # get the first one's shape.
    layout = h5py.VirtualLayout(shape=(len(file_names_to_concatenate), ) + sh,
                                dtype=np.float64)
    with h5py.File("VDS.h5", 'w', libver='latest') as f:
        for i, filename in enumerate(file_names_to_concatenate):
            vsource = h5py.VirtualSource(filename, entry_key, shape=sh)
            layout[i, :, :, :] = vsource

        f.create_virtual_dataset(entry_key, layout, fillvalue=0)
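The hard-coded layout[i, :, :, :] ties this helper to 3-D sources. A usage sketch under that assumption, with hypothetical filenames:

concatenate(["scan0.h5", "scan1.h5", "scan2.h5"])
with h5py.File("VDS.h5", "r") as f:
    print(f["data"].shape)  # (3,) + shape of one source dataset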
Example No. 17
    def createResource(cls, directory):
        filename = os.path.join(directory, "base.h5")
        extH5FileName = os.path.join(directory, "base__external.h5")
        extDatFileName = os.path.join(directory, "base__external.dat")

        externalh5 = h5py.File(extH5FileName, mode="w")
        externalh5["target/dataset"] = 50
        externalh5["target/link"] = h5py.SoftLink("/target/dataset")
        externalh5["/ext/vds0"] = [0, 1]
        externalh5["/ext/vds1"] = [2, 3]
        externalh5.close()

        numpy.array([0, 1, 10, 10, 2, 3]).tofile(extDatFileName)

        h5 = h5py.File(filename, mode="w")
        h5["group/dataset"] = 50
        h5["link/soft_link"] = h5py.SoftLink("/group/dataset")
        h5["link/soft_link_to_group"] = h5py.SoftLink("/group")
        h5["link/soft_link_to_link"] = h5py.SoftLink("/link/soft_link")
        h5["link/soft_link_to_file"] = h5py.SoftLink("/")
        h5["group/soft_link_relative"] = h5py.SoftLink("dataset")
        h5["link/external_link"] = h5py.ExternalLink(extH5FileName,
                                                     "/target/dataset")
        h5["link/external_link_to_link"] = h5py.ExternalLink(
            extH5FileName, "/target/link")
        h5["broken_link/external_broken_file"] = h5py.ExternalLink(
            extH5FileName + "_not_exists", "/target/link")
        h5["broken_link/external_broken_link"] = h5py.ExternalLink(
            extH5FileName, "/target/not_exists")
        h5["broken_link/soft_broken_link"] = h5py.SoftLink("/group/not_exists")
        h5["broken_link/soft_link_to_broken_link"] = h5py.SoftLink(
            "/group/not_exists")
        layout = h5py.VirtualLayout((2, 2), dtype=int)
        layout[0] = h5py.VirtualSource("base__external.h5",
                                       name="/ext/vds0",
                                       shape=(2, ),
                                       dtype=int)
        layout[1] = h5py.VirtualSource("base__external.h5",
                                       name="/ext/vds1",
                                       shape=(2, ),
                                       dtype=int)
        h5.create_group("/ext")
        h5["/ext"].create_virtual_dataset("virtual", layout)
        external = [("base__external.dat", 0, 2 * 8),
                    ("base__external.dat", 4 * 8, 2 * 8)]
        h5["/ext"].create_dataset("raw",
                                  shape=(2, 2),
                                  dtype=int,
                                  external=external)
        h5.close()

        return filename
Example No. 18
def tile_h5datasets(dest, name, sources, shape_map, tile_shape, nscandim=1):
    """Merge datasets in a virtual dataset.

    :param h5py.Group dest:
    :param str name:
    :param list(h5py.Dataset) sources:
    :param dict shape_map:
    :param tuple tile_shape:
    :param int nscandim: start index of the data dimensions
    """
    dset_shapes = [dset.shape for dset in sources]
    scan_shapes = [dset_shape[:nscandim]
                   for dset_shape in dset_shapes]  # F-order
    det_shapes = [dset_shape[nscandim:] for dset_shape in dset_shapes]

    reshaped_scan_shapes = [
        shape_map.get(scan_shape, scan_shape) for scan_shape in scan_shapes
    ]  # F-order
    reshaped_scan_shapes = [s[::-1] for s in reshaped_scan_shapes]  # C-order

    reduced_scan_shapes, reshaped_scan_shapes = zip(
        *(match_shapes([shape1, shape2[::-1]])
          for shape1, shape2 in zip(scan_shapes, reshaped_scan_shapes)))
    reshaped_scan_shapes = [s[::-1] for s in reshaped_scan_shapes]  # C-order
    tile_shape = tile_shape[::-1]  # C-order

    layout_scan_shape, indices = tile_indices(tile_shape,
                                              reshaped_scan_shapes,
                                              order="C")

    layout_shape = layout_scan_shape + max_shape(det_shapes)

    dtype = sources[0].dtype
    fillvalue = sources[0].fillvalue
    layout = h5py.VirtualLayout(shape=layout_shape, dtype=dtype)
    for layout_idx, dset, reduced_scan_shape, det_shape in zip(
            indices, sources, reduced_scan_shapes, det_shapes):
        vsource = h5py.VirtualSource(
            dset.file.filename,
            dset.name,
            shape=dset.shape,
            dtype=dset.dtype,
        )
        reduced_source_shape = reduced_scan_shape + det_shape
        det_idx = tuple(slice(0, n) for n in det_shape)
        if reduced_source_shape != vsource.shape:
            vsource_idx = tuple(slice(0, n) for n in reduced_source_shape)
            vsource_idx += det_idx
            vsource = vsource[vsource_idx]
        layout_idx += det_idx
        layout[layout_idx] = vsource
    dest.create_virtual_dataset(name, layout, fillvalue=fillvalue)
Example No. 19
    def test_index_layout(self):
        # Assemble virtual dataset (indexing target)
        layout = h5.VirtualLayout((100, ), 'i4')

        inds = [3, 6, 20, 25, 33, 47, 70, 75, 96, 98]

        filename = osp.join(self.tmpdir, "1.h5")
        vsource = h5.VirtualSource(filename, 'data', shape=(10, ))
        layout[inds] = vsource

        outfile = osp.join(self.tmpdir, 'VDS.h5')

        # Assemble virtual dataset (indexing source)
        layout2 = h5.VirtualLayout((6, ), 'i4')

        inds2 = [0, 1, 4, 5, 8]
        layout2[1:] = vsource[inds2]

        # Add virtual datasets to output file and close
        with h5.File(outfile, 'w', libver='latest') as f:
            f.create_virtual_dataset('/data', layout, fillvalue=-5)
            f.create_virtual_dataset('/data2', layout2, fillvalue=-3)

        # Read data from virtual datasets
        with h5.File(outfile, 'r') as f:
            data = f['/data'][()]
            data2 = f['/data2'][()]

        # Verify
        assert_array_equal(data[inds], np.arange(10) * 10)
        assert_array_equal(data2[1:], [0, 10, 40, 50, 80])

        mask = np.zeros(100)
        mask[inds] = 1
        self.assertEqual(data[mask == 0].min(), -5)
        self.assertEqual(data[mask == 0].max(), -5)
        self.assertEqual(data2[0], -3)
Example No. 20
    def setUp(self):
        self.tmpdir = tempfile.mkdtemp()
        self.path = osp.join(self.tmpdir, "resize.h5")
        with h5.File(self.path, "w") as f:
            source_dset = f.create_dataset("source",
                                           data=np.arange(20),
                                           shape=(10, 2),
                                           maxshape=(None, 2),
                                           chunks=(10, 1),
                                           fillvalue=-1)
            self.layout = h5.VirtualLayout((10, 1), np.int64, maxshape=(None, 1))
            layout_source = h5.VirtualSource(source_dset)
            self.layout[:h5.UNLIMITED, 0] = layout_source[:h5.UNLIMITED, 1]

            f.create_virtual_dataset("virtual", self.layout)
Example No. 21
def combine_h5(filelist, out_h5file):
    n_files = len(filelist)

    n_x = list()
    n_y = list()
    for i in range(n_files):
        with h5py.File(filelist[i], 'r') as f:
            n_x.append((f['x'].shape[0]))
            n_y.append(len(f['y']))
            x_dim = f['x'].shape[1]
    tot_n_x = np.sum(n_x)
    tot_n_y = np.sum(n_y)
    print(f"Total no. of x = {tot_n_x}")
    print(f"Total no. of y = {tot_n_y}")
    print(f"Feature vectors dim = {x_dim}")

    # Assemble virtual dataset
    x_layout = h5py.VirtualLayout(shape=(tot_n_x, x_dim), dtype=np.float32)
    y_layout = h5py.VirtualLayout(shape=(tot_n_y, ), dtype=np.int32)
    k1 = 0
    for i in range(n_files):
        print(f"Reading {filelist[i]}")
        # Contiguous slices are the natural mapping here; each source
        # covers one block of rows in the combined layout.
        x_layout[k1:k1 + n_x[i]] = h5py.VirtualSource(filelist[i],
                                                      "x",
                                                      shape=(n_x[i], x_dim))
        y_layout[k1:k1 + n_y[i]] = h5py.VirtualSource(filelist[i],
                                                      "y",
                                                      shape=(n_y[i], ))
        k1 = k1 + n_x[i]

    # Add virtual dataset to output file
    with h5py.File(out_h5file, "w", libver="latest") as f:
        print(f"Writing combined file {out_h5file}")
        f.create_virtual_dataset("x", x_layout, fillvalue=None)
        f.create_virtual_dataset("y", y_layout, fillvalue=None)
Example No. 22
def concatenate_virtual_h5(input_file_names: List[str],
                           output_name: str,
                           fields: Optional[List[str]] = None):
    r"""Concatenate HDF5 files into a virtual HDF5 file.

    Concatenates a list `input_file_names` of HDF5 files containing
    the same format into a single virtual dataset.

    Parameters
    ----------
    input_file_names : List[str]
        List of HDF5 file names to concatenate.
    output_name : str
        Name of output virtual HDF5 file.
    fields : Optional[List[str]]
        Which dataset fields to concatenate. Will concatenate all fields by default.
    """

    # Open first file to get dataset shape and dtype
    # Assumes uniform number of data points per file
    h5_file = h5py.File(input_file_names[0], "r")

    if not fields:
        fields = list(h5_file.keys())

    # Helper function to output concatenated shape
    def concat_shape(shape: Tuple[int, ...]) -> Tuple[int, ...]:
        return (len(input_file_names) * shape[0], *shape[1:])

    # Create a virtual layout for each input field
    layouts = {
        field: h5py.VirtualLayout(
            shape=concat_shape(h5_file[field].shape),
            dtype=h5_file[field].dtype,
        )
        for field in fields
    }

    with h5py.File(output_name, "w", libver="latest") as f:
        for field in fields:
            for i, filename in enumerate(input_file_names):
                shape = h5_file[field].shape
                vsource = h5py.VirtualSource(filename, field, shape=shape)
                layouts[field][i * shape[0]:(i + 1) * shape[0], ...] = vsource

            f.create_virtual_dataset(field, layouts[field])

    h5_file.close()
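Because every layout extent is derived from the first file alone, a pre-flight check (a sketch that would run inside concatenate_virtual_h5 once fields is known) makes the uniformity assumption explicit:

# Verify each remaining file matches the first file's per-field shapes.
for fn in input_file_names[1:]:
    with h5py.File(fn, "r") as f:
        for field in fields:
            assert f[field].shape == h5_file[field].shape, \
                f"{fn}:{field} does not match the first file"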
Example No. 23
    def finalize(self):
        vds_shape = (len(self.__idxs), ) + self.__shape

        # Assemble virtual dataset
        layout = h5py.VirtualLayout(shape=vds_shape, dtype=self.__dtype)
        for key, filename in self.__idxs.items():
            vsource = h5py.VirtualSource(filename,
                                         self.__opath,
                                         shape=self.__shape)
            layout[key] = vsource

        with h5py.File(self.foname, "w", libver="latest") as f:
            f.create_virtual_dataset(self.dspath, layout, fillvalue=-5)
        self.LogInfo("finalized, close HDF5 File: " + self.foname)
        return True
Example No. 24
def split(input_h5, output_h5):
    """Read the data file, create N_FAST * N_SLOW new data sets, then
    copy the data from the former into the latter and build a VDS"""

    with h5py.File(input_h5, "r") as fin:
        frames, slow, fast = fin["data"].shape

        output_files = []
        output_dsets = []
        for n in range(N_FAST * N_SLOW):
            filename = output_h5.replace(".h5", "_%02d.h5" % n)
            fout = h5py.File(filename, "x")
            dset = fout.create_dataset(
                "data",
                (frames, MOD_SLOW, MOD_FAST),
                chunks=(1, MOD_SLOW, MOD_FAST),
                compression=bitshuffle.h5.H5FILTER,
                compression_opts=(0, bitshuffle.h5.H5_COMPRESS_LZ4),
                dtype=fin["data"].dtype,
            )
            output_files.append((fout, filename))
            output_dsets.append(dset)

        blit(fin["data"], output_dsets)

        for fout in output_files:
            fout[0].close()

        # create VDS
        layout = h5py.VirtualLayout(shape=(frames, slow, fast), dtype="i4")

        for n in range(N_SLOW * N_FAST):
            s, f = divmod(n, N_FAST)
            source = h5py.VirtualSource(output_files[n][1],
                                        "data",
                                        shape=(frames, MOD_SLOW, MOD_FAST))
            f0 = f * (MOD_FAST + GAP_FAST)
            f1 = f0 + MOD_FAST
            s0 = s * (MOD_SLOW + GAP_SLOW)
            s1 = s0 + MOD_SLOW
            layout[:, s0:s1, f0:f1] = source

        fout = h5py.File(output_h5, "x")
        data = fout.create_virtual_dataset("data", layout, fillvalue=-1)
        for k in "image_nr_low", "image_nr_high":
            data.attrs.create(k, fin["data"].attrs.get(k), dtype="i4")
        fout.close()
Example No. 25
def join(target_name, feature_names, databases):

    with h5py.File(target_name, "w") as f:
        for feat_name in feature_names:
            lengths = [getattr(db, feat_name).shape[0] for db in databases]
            dim = set([getattr(db, feat_name).shape[1]
                       for db in databases]).pop()
            layout = h5py.VirtualLayout(shape=(sum(lengths), dim))
            offset = 0
            for i, n in enumerate(lengths):
                vsource = h5py.VirtualSource(databases[i].h5_file,
                                             feat_name,
                                             shape=(n, dim))
                layout[offset:offset + n] = vsource
                offset += n
            ds = f.create_virtual_dataset(feat_name, layout)
            # Dataset.attrs is read-only as a property; copy entries instead.
            ds.attrs.update(getattr(databases[0], feat_name).attrs)
Example No. 26
def combineFiles(fileNames, keys):

    '''
    Use a virtual dataset in a temporary .h5 file to combine files with entries
    of the same shape, so that they appear to be one contiguous dataset.
    '''

    tmpFile = '/tmp/tmpVDS.h5'

    if os.path.exists(tmpFile):
        os.remove(tmpFile)

    for key in keys:
        sources = []
        totalLength = 0
        shape = None

        for fileName in fileNames:
            with h5py.File(fileName, 'r') as tmpF:
                # VirtualSource records the file path and dataset name,
                # so the source file can be closed again immediately.
                source = h5py.VirtualSource(tmpF[key])
                shape = source.shape[1:]
                totalLength += source.shape[0]
                sources.append(source)

        layout = h5py.VirtualLayout(shape=(totalLength,) + tuple(shape),
                                    dtype=np.float64)

        offset = 0
        for source in sources:
            length = source.shape[0]
            layout[offset:offset + length] = source
            offset += length

        with h5py.File(tmpFile, 'a', libver='latest') as f:
            f.create_virtual_dataset(key, layout, fillvalue=np.nan)

    return tmpFile
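The helper returns the path of the temporary file it wrote; a usage sketch with hypothetical inputs:

path = combineFiles(["run1.h5", "run2.h5"], ["images"])
with h5py.File(path, "r") as f:
    print(f["images"].shape)  # (totalLength,) + per-entry shape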
Example No. 27
    def construct_virtual_sources(self, task, file_shape):
        taskname = task['name']
        layout = task['layout']
        scales = task['scales']
        op = task['operator']
        virt_layout = h5py.VirtualLayout(shape=file_shape, dtype=op.dtype)

        for i in range(self.dist.comm_cart.size):
            file_name = '%s_s%i_p%i.h5' % (self.base_path.stem, self.set_num,
                                           i)
            folder_name = '%s_s%i' % (self.base_path.stem, self.set_num)
            folder_path = self.base_path.joinpath(folder_name)
            src_file_name = folder_path.joinpath(file_name).relative_to(
                self.base_path)
            gnc_shape, gnc_start, write_shape, write_start, write_count = self.get_write_stats(
                layout,
                scales,
                op.domain,
                op.tensorsig,
                index=0,
                virtual_file=True,
                rank=i)

            shape_stop = len(op.tensorsig) + 1
            src_shape = file_shape[slice(0, shape_stop)] + layout.local_shape(
                op.domain, scales, rank=i)
            start = gnc_start
            count = write_count

            spatial_slices = tuple(
                slice(s, s + c) for (s, c) in zip(start, count))

            slices = (slice(None), ) + spatial_slices
            maxshape = (None, ) + tuple(count)
            tname = 'tasks/{}'.format(taskname)

            vsource = h5py.VirtualSource(src_file_name,
                                         name=tname,
                                         shape=src_shape,
                                         maxshape=maxshape)
            virt_layout[slices] = vsource

        return virt_layout
Example No. 28
    def create_virtual_dataset(fdir, files, key):
        """ 
        construct a virtual dataset, containing multiple h5 files
        
        :param fdir: location of virtual dataset
        :param files: list of files (strings) to be added to virtual dataset
        :param key: key from .h5 files to add to virtual dataset
        """
        with h5py.File(files[0], 'r') as f:
            sh = f[key].shape  # get the first one's shape.
        layout = h5py.VirtualLayout(shape=(len(files), ) + sh,
                                    dtype=np.float64)
        with h5py.File(fdir, 'w', libver='latest') as f:
            f.create_dataset("index", data=files)
            for i, filename in enumerate(files):
                vsource = h5py.VirtualSource(filename, key, shape=sh)
                layout[i] = vsource

            f.create_virtual_dataset(key, layout, fillvalue=0)
Example No. 29
    def make_virtual_ds(self):
        # Assemble virtual dataset
        layout = h5.VirtualLayout((4, 100), 'i4', maxshape=(4, None))

        for n in range(1, 5):
            filename = osp.join(self.tmpdir, "{}.h5".format(n))
            vsource = h5.VirtualSource(filename, 'data', shape=(100, ))
            # Fill the first half with positions 0, 2, 4... from the source
            layout[n - 1, :50] = vsource[0:100:2]
            # Fill the second half with positions 1, 3, 5... from the source
            layout[n - 1, 50:] = vsource[1:100:2]

        outfile = osp.join(self.tmpdir, 'VDS.h5')

        # Add virtual dataset to output file
        with h5.File(outfile, 'w', libver='latest') as f:
            f.create_virtual_dataset('/group/data', layout, fillvalue=-5)

        return outfile
Example No. 30
    def create_virtual_layout(self, source_meta):
        """Create a list of VirtualMaps of raw data to the VDS.

        Args:
            source_meta(SourceMeta): Source attributes

        Returns:
            list(VirtualMap): Maps describing links between raw data and VDS

        """
        source_shape = source_meta.frames + \
            (source_meta.height, source_meta.width)
        spacing = self.construct_vds_spacing()
        target_height = source_meta.height * len(self.files) + sum(spacing)
        target_shape = source_meta.frames + (target_height, source_meta.width)
        self.logger.debug("VDS metadata:\n"
                          "  Shape: %s\n"
                          "  Spacing: %s", target_shape, spacing)

        v_layout = h5.VirtualLayout(target_shape, source_meta.dtype)

        current_position = 0
        for stripe_idx, file_path in enumerate(self.files):
            v_source = h5.VirtualSource(file_path,
                                        name=self.source_node,
                                        shape=source_shape,
                                        dtype=source_meta.dtype)

            start = current_position
            end = start + source_meta.height
            current_position = end + spacing[stripe_idx]

            # Hyperslab: All frames for each axis,
            #            Height bounds of stripe,
            #            Entire width
            v_layout[..., start:end, :] = v_source

            self.logger.debug("Mapping %s[..., %s:%s, :] to %s[...].",
                              self.name, start, end,
                              file_path.split("/")[-1])

        return v_layout