Example #1
def gather_videos_vqp(fd: h5py.File):
    """Specified for VQP"""
    root = Path(FLAGS.input_dir)
    glob = FLAGS.glob or '*'
    inputs = sorted(root.glob(glob))
    candidates = set(i.parent for i in filter(lambda f: f.is_file(), inputs))
    frames_info = {}
    for p in tqdm.tqdm(candidates):
        seq = [
            Image.open(f)
            for f in filter(lambda f: f.is_file(), sorted(p.rglob('*')))
        ]
        cube = np.stack(seq)
        if FLAGS.data_format == 'channels_first':
            cube = cube.transpose([0, 3, 1, 2])
        cube = np.expand_dims(cube, 0)
        path = p.relative_to(root)
        # ugly
        path = path.parent / path.stem.split('_')[0]
        key = str(path.as_posix())
        if key not in fd:
            fd.create_dataset(key,
                              data=cube,
                              maxshape=(52, ) + cube.shape[1:],
                              compression=FLAGS.compression)
            frames_info[key] = len(seq)
        else:
            d = fd[key]
            cnt = d.shape[0] + 1
            d.resize(cnt, 0)
            d[-1] = cube
        del cube
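The append logic above relies on h5py's resizable datasets: a dataset created with a maxshape can later be grown (up to that maxshape) with resize. A minimal, self-contained sketch of that pattern; the file name and shapes here are illustrative, not taken from the example:

import h5py
import numpy as np

with h5py.File('resizable_demo.h5', 'w') as f:
    frame = np.zeros((1, 3, 64, 64), dtype=np.uint8)
    # First write: allow growth up to 52 entries along axis 0, like the example above.
    d = f.create_dataset('clip/0001', data=frame, maxshape=(52,) + frame.shape[1:])
    # Append: grow axis 0 by one and write into the new slot.
    d.resize(d.shape[0] + 1, axis=0)
    d[-1] = frame[0]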
Example #2
class TestSimpleSlicing(TestCase):

    """
        Feature: Simple NumPy-style slices (start:stop:step) are supported.
    """

    def setUp(self):
        self.f = File(self.mktemp(), 'w')
        self.arr = np.arange(10)
        self.dset = self.f.create_dataset('x', data=self.arr)

    def tearDown(self):
        if self.f:
            self.f.close()

    def test_negative_stop(self):
        """ Negative stop indexes work as they do in NumPy """
        self.assertArrayEqual(self.dset[2:-2], self.arr[2:-2])

    def test_write(self):
        """Assigning to a 1D slice of a 2D dataset
        """
        dset = self.f.create_dataset('x2', (10, 2))

        x = np.zeros((10, 1))
        dset[:, 0] = x[:, 0]
        with self.assertRaises(TypeError):
            dset[:, 1] = x
Example #3
File: test_file.py Project: hyoklee/h5py
    def test_create_with_space_strategy(self):
        """ Create file with file space strategy """
        fname = self.mktemp()
        fid = File(fname,
                   'w',
                   fs_strategy="page",
                   fs_persist=True,
                   fs_threshold=100)
        self.assertTrue(fid)
        # Unable to set file space strategy of an existing file
        with self.assertRaises(ValueError):
            File(fname, 'a', fs_strategy="page")
        # Invalid file space strategy type
        with self.assertRaises(ValueError):
            File(self.mktemp(), 'w', fs_strategy="invalid")

        dset = fid.create_dataset('foo', (100, ), dtype='uint8')
        dset[...] = 1
        dset = fid.create_dataset('bar', (100, ), dtype='uint8')
        dset[...] = 1
        del fid['foo']
        fid.close()

        fid = File(fname, 'a')
        plist = fid.id.get_create_plist()
        fs_strat = plist.get_file_space_strategy()
        assert (fs_strat[0] == 1)
        assert (fs_strat[1] == True)
        assert (fs_strat[2] == 100)

        dset = fid.create_dataset('foo2', (100, ), dtype='uint8')
        dset[...] = 1
        fid.close()
Example #4
def unite(batches, out_dir, out_name, batch_size):
    num_batches = len(batches)

    file = File(f'{out_dir}/{out_name}.h5', 'w')

    x_data = file.create_dataset('x_data',
                                 shape=(batch_size * num_batches, 299, 299, 3),
                                 dtype=np.uint8)
    x_data_adv = file.create_dataset('x_data_adv',
                                     shape=(batch_size * num_batches, 299, 299,
                                            3),
                                     dtype=np.uint8)
    y_data = file.create_dataset('y_data',
                                 shape=(batch_size * num_batches, 1000),
                                 dtype=np.float32)

    for i, batch in enumerate(batches):
        batch = File(batch, 'r')

        batch_x, batch_x_adv, batch_y = map(lambda x: batch[x],
                                            ('x_data', 'x_data_adv', 'y_data'))

        offset = i * batch_size
        x_data[offset:offset + batch_size] = batch_x
        x_data_adv[offset:offset + batch_size] = batch_x_adv
        y_data[offset:offset + batch_size] = batch_y

        batch.close()
        logger.info(f'{i+1}/{num_batches} batches for {out_name} repacked')

    file.close()
Example #5
def add_vanhateren_subset(h5handle: h5py.File, prefix, imglist, imgbase):
    length_filename = len(imglist[0])
    for imgname in imglist:
        assert len(imgname) == length_filename
    imglist_dtype = 'S' + str(length_filename)

    imgarray_all = []
    for idx, imgname in enumerate(imglist):
        with open(os.path.join(imgbase, imgname), 'rb') as f:
            imgarray = np.fromfile(f, dtype='>u2').reshape((1024, 1536))
            assert imgarray.size == 1024 * 1536
            imgarray = imgarray[:, 2:-2]  # remove black stuff.
            imgarray_all.append(imgarray)
        if idx % 100 == 0:
            print(idx)
    data_to_write = np.asarray(imgarray_all)
    print(data_to_write.shape, data_to_write.dtype)
    h5handle.create_dataset(
        prefix + '_data',
        data=data_to_write,
        chunks=(1, 256, 1532),  # basically, make each chunk 1/4 of an image.
        compression='gzip',
        shuffle=True,
        fletcher32=True)
    h5handle.create_dataset(prefix + '_filelist',
                            data=np.array(imglist, dtype=imglist_dtype))
Example #6
    def test_create_with_space_strategy(self):
        """ Create file with file space strategy """
        fname = self.mktemp()
        fid = File(fname,
                   'w',
                   fs_strategy="page",
                   fs_persist=True,
                   fs_threshold=100)
        self.assertTrue(fid)
        dset = fid.create_dataset('foo', (100, ), dtype='uint8')
        dset[...] = 1
        dset = fid.create_dataset('bar', (100, ), dtype='uint8')
        dset[...] = 1
        del fid['foo']
        fid.close()

        fid = File(fname, 'a')
        plist = fid.id.get_create_plist()
        fs_strat = plist.get_file_space_strategy()
        assert (fs_strat[0] == 1)
        assert (fs_strat[1] == True)
        assert (fs_strat[2] == 100)

        dset = fid.create_dataset('foo2', (100, ), dtype='uint8')
        dset[...] = 1
        fid.close()
Example #7
def make_h5py_file(fp, num_entries, classes):
    f = File(fp, 'w')
    f.attrs['classes'] = np.array([x.encode() for x in classes])
    byte_type = special_dtype(vlen=np.dtype('uint8'))
    images_dset = f.create_dataset("images", (num_entries,), dtype=byte_type, maxshape=(None,))
    masks_dset = f.create_dataset("masks", (num_entries,), dtype=byte_type, maxshape=(None,))
    return images_dset, masks_dset, f
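The variable-length uint8 dtype above lets each entry hold a byte payload of a different length (for example an encoded image). A hedged usage sketch, assuming make_h5py_file and its imports are in scope; the file name and payloads are made up:

import numpy as np

images_dset, masks_dset, f = make_h5py_file('demo.h5', num_entries=2, classes=['cat', 'dog'])
# Each element is an independent 1-D uint8 array, so lengths can differ per entry.
images_dset[0] = np.frombuffer(b'\x89PNG-like bytes', dtype=np.uint8)
images_dset[1] = np.frombuffer(b'short', dtype=np.uint8)
f.close()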
Example #8
    def test_visit(self):
        fname = self.mktemp()
        fid = File(fname, 'w')
        fid.create_dataset('foo', (100, ), dtype='uint8')
        with pytest.raises(TestException, match='throwing exception'):
            fid.visititems(throwing)
        fid.close()
Example #9
def write_contact_map(
    h5_file: h5py.File,
    rows: List[np.ndarray],
    cols: List[np.ndarray],
    vals: Optional[List[np.ndarray]] = None,
):

    # Helper function to create ragged array
    def ragged(data):
        a = np.empty(len(data), dtype=object)
        a[...] = data
        return a

    # list of np arrays of shape (2 * X) where X varies
    data = ragged([np.concatenate(row_col) for row_col in zip(rows, cols)])
    h5_file.create_dataset(
        "contact_map",
        data=data,
        dtype=h5py.vlen_dtype(np.dtype("int16")),
        fletcher32=True,
        chunks=(1,) + data.shape[1:],
    )

    # Write optional values field for contact map. Could contain CA-distances.
    if vals is not None:
        data = ragged(vals)
        h5_file.create_dataset(
            "contact_map_values",
            data=data,
            dtype=h5py.vlen_dtype(np.dtype("float32")),
            fletcher32=True,
            chunks=(1,) + data.shape[1:],
        )
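A small usage sketch for write_contact_map, assuming the function and its imports above are available; the row/column indices below are made up:

import h5py
import numpy as np

rows = [np.array([0, 1, 2]), np.array([0, 1])]   # per-frame row indices
cols = [np.array([1, 2, 0]), np.array([1, 0])]   # per-frame column indices
with h5py.File('contact_demo.h5', 'w') as f:
    write_contact_map(f, rows, cols)
    # Each entry of the ragged dataset is one concatenated (rows, cols) index vector.
    print(f['contact_map'][0])   # [0 1 2 1 2 0]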
Example #10
def save_optimizer_state_dict(hf: h5py.File, state_dict: Optional[bytes]) -> None:
    if state_dict is None:
        return
    hf.create_dataset(
        OPTIMIZER_STATE_DICT_DATASET,
        data=np.frombuffer(state_dict, dtype=NP_VOID_DTYPE),
    )
Example #11
def main():
    a = DetPulseCoord()
    fileid = h5f.create(b"test.h5")
    x = [1, 3, 3]
    y = [1., 3., 3, 4., 5, 3., 33.]
    x = ones((100, 3), dtype=int32)
    y = ones((100, 7), dtype=float32)
    z = ones((100, 2), dtype=float32)
    c = [(x[i], y[i], z[i]) for i in range(100)]
    data = {a.names[0]: x, a.names[1]: y}
    dspaceid = h5s.create_simple((1, ), (h5s.UNLIMITED, ))
    # dset = h5d.create(fileid, a.name, a.type, dspaceid)
    # dset.write()
    file = File("test.h5")
    numpytype = dtype([("coord", int32, (3, )), ("pulse", float32, (7, )),
                       ("EZ", float32, (2, ))])
    data = array(c, dtype=numpytype)
    tid = h5t.C_S1.copy()
    tid.set_size(6)
    H5T6 = Datatype(tid)
    tid.set_size(4)
    H5T_C_S1_4 = Datatype(tid)
    file.create_dataset("DetPulseCoord", data=data)
    file.attrs.create("CLASS", "TABLE", dtype=H5T6)
    file.attrs.create("FIELD_0_NAME", a.names[0])
    file.attrs.create("FIELD_1_NAME", a.names[1])
    file.attrs.create("TITLE", "Detpulse coord pair data")

    file.attrs.create("VERSION", "3.0", dtype=H5T_C_S1_4)
    file.attrs.create("abstime", 1.45e9, dtype=float64, shape=(1, ))
    file.attrs.create("nevents", 122421, dtype=float64, shape=(1, ))
    file.attrs.create("runtime", 125000, dtype=float64, shape=(1, ))
    file.flush()
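The compound dtype above maps each record to named fields; a short sketch of reading individual fields back, with the field names taken from the dtype defined in the example and the same file name assumed:

import h5py

with h5py.File('test.h5', 'r') as f:
    table = f['DetPulseCoord']
    coords = table['coord']   # (100, 3) int32 field of the compound dtype
    pulses = table['pulse']   # (100, 7) float32 field
    print(coords.shape, pulses.dtype)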
Example #12
def create_datasets_and_return(f: h5py.File):
    # https://docs.h5py.org/en/stable/high/dataset.html
    # RGB: 720 x 1280 x 4 color channels x num images
    timestampset = f.create_dataset(
        name="timestamp",
        shape=(1, 1),
        maxshape=(None, 1),
        dtype="int64"
    )

    colorset = f.create_dataset(
        name="color",
        shape=(1, RGBD_X, RGBD_Y, COLOR_Z),
        maxshape=(None, RGBD_X, RGBD_Y, COLOR_Z),
        compression="gzip",
        # Set this accordingly to trade off FPS and filesize
        compression_opts=0,
        dtype="uint8",
    )

    depthset = f.create_dataset(
        name="depth",
        shape=(1, RGBD_X, RGBD_Y),
        maxshape=(None, RGBD_X, RGBD_Y),
        compression="gzip",
        # compression_opts=9,
        dtype="uint16",
    )

    return colorset, depthset, timestampset
Example #13
    def _preprocess_split(
            self, h5py_file: h5py.File, split_name: str,
            raw_split_resource: StreamedResource) -> StreamedResource:
        split_df = pd.read_csv(raw_split_resource, delimiter=",")
        targets_df = split_df["intent"]
        sample_texts_df = split_df["tokens"]

        embedder_instances = [WordEmbeddings("en-glove")]
        doc_embedders = DocumentPoolEmbeddings(embedder_instances)

        sample_location = os.path.join(split_name, "samples")
        target_location = os.path.join(split_name, "targets")
        embedding_size = sum(
            [embedder.embedding_length for embedder in embedder_instances])
        string_datatype = h5py.string_dtype(encoding='ascii')
        sample_dset = h5py_file.create_dataset(sample_location,
                                               shape=(
                                                   len(split_df),
                                                   embedding_size,
                                               ))
        target_dset = h5py_file.create_dataset(target_location,
                                               (len(split_df), ),
                                               dtype=string_datatype)
        for i in tqdm.tqdm(range(len(split_df)),
                           desc=f"Embedding {split_name} split"):
            doc_text = AtisPreprocessor._clean_text(sample_texts_df.iloc[i])
            embedded_doc = self._embed_document(doc_text, doc_embedders)
            sample_dset[i] = embedded_doc
            target_dset[i] = targets_df.iloc[i]
Example #15
def write_point_cloud(h5_file: h5py.File, point_cloud: np.ndarray):
    h5_file.create_dataset(
        "point_cloud",
        data=point_cloud,
        dtype="float32",
        fletcher32=True,
        chunks=(1,) + point_cloud.shape[1:],
    )
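A usage note: the chunk shape (1,) + point_cloud.shape[1:] makes each leading-axis frame its own chunk, so the input is expected to have the frame axis first. A hedged sketch with made-up data, assuming write_point_cloud from the example above is in scope:

import h5py
import numpy as np

point_cloud = np.random.rand(5, 128, 3).astype(np.float32)   # 5 frames of 128 xyz points
with h5py.File('point_cloud_demo.h5', 'w') as f:
    write_point_cloud(f, point_cloud)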
Example #16
def hdf5_writer(filename, data, components=None):
    """
    Write a dataset or a subset to an HDF5 file.

    Parameters
    ----------
    data : `~glue.core.data.Data` or `~glue.core.subset.Subset`
        The data or subset to export
    components : `list` or `None`
        The components to export. Set this to `None` to export all components.
    """

    if isinstance(data, Subset):
        mask = data.to_mask()
        data = data.data
    else:
        mask = None

    from h5py import File

    f = File(filename, 'w')

    for cid in data.main_components + data.derived_components:

        if components is not None and cid not in components:
            continue

        if data.get_kind(cid) == 'categorical':
            values = data[cid]
            if values.dtype.kind == 'U':
                values = np.char.encode(values,
                                        encoding='ascii',
                                        errors='replace')
            else:
                values = values.copy()
        else:
            values = data[cid].copy()

        if mask is not None:
            if values.ndim == 1:
                values = values[mask]
            else:
                if values.dtype.kind == 'f':
                    values[~mask] = np.nan
                elif values.dtype.kind == 'i':
                    values[~mask] = 0
                elif values.dtype.kind == 'S':
                    values[~mask] = ''
                else:
                    warnings.warn(
                        "Unknown data type in HDF5 export: {0}".format(
                            values.dtype))
                    continue

        f.create_dataset(cid.label, data=values)

    f.close()
Example #17
def save_in(store: h5py.File, layer_outputs: List[np.ndarray], metrics: List[Metric], references: pd.DataFrame):
    """ Save batch data into HDF5 file. """
    for index, metric in enumerate(metrics):
        sample_id = len(references)
        references.loc[sample_id] = metric

        for output_index, batch_layer_outputs in enumerate(layer_outputs):
            layer_output = batch_layer_outputs[index]
            store.create_dataset(f'outputs/{output_index}/{sample_id}', data=layer_output)
Example #18
File: sefd.py Project: ska-sa/katsdpmodels
    def to_hdf5(self, hdf5: h5py.File) -> None:
        """"""
        hdf5.attrs['band'] = self.band
        if self.antenna is not None:
            hdf5.attrs['antenna'] = self.antenna
        if self.receiver is not None:
            hdf5.attrs['receiver'] = self.receiver
        hdf5.attrs['correlator_efficiency'] = self.correlator_efficiency
        hdf5.create_dataset('frequency', data=self.frequency, track_times=False)
        hdf5.create_dataset('coefs', data=self.coefs, track_times=False)
Example #19
File: fileTools.py Project: d-v-b/zebra
    def h5_writer(data, h5_path):
        from h5py import File
        from os.path import exists
        from os import remove

        if exists(h5_path):
            remove(h5_path)

        f = File(h5_path, 'w')
        f.create_dataset('default', data=data, compression='gzip', chunks=True, shuffle=True)
        f.close()
Example #20
def save_optimizer_state_dict(
    hf: h5py.File,
    state_dict: Optional[OptimizerStateDict],
) -> None:
    if state_dict is None:
        return
    with io.BytesIO() as fobj:
        torch.save(state_dict, fobj)
        hf.create_dataset(OPTIMIZER_STATE_DICT_DATASET,
                          data=np.frombuffer(fobj.getbuffer(), dtype=NP_VOID_DTYPE))
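Reading the state back is not part of the example; one possible counterpart, assuming OPTIMIZER_STATE_DICT_DATASET and NP_VOID_DTYPE are the same project constants used above:

import io
import h5py
import torch

def load_optimizer_state_dict(hf: h5py.File):
    # Mirror of the save path: return None if nothing was written.
    if OPTIMIZER_STATE_DICT_DATASET not in hf:
        return None
    raw = hf[OPTIMIZER_STATE_DICT_DATASET][...].tobytes()
    return torch.load(io.BytesIO(raw))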
Example #21
File: hdf5.py Project: sergiopasra/glue
def hdf5_writer(filename, data, components=None):
    """
    Write a dataset or a subset to an HDF5 file.

    Parameters
    ----------
    data : `~glue.core.data.Data` or `~glue.core.subset.Subset`
        The data or subset to export
    components : `list` or `None`
        The components to export. Set this to `None` to export all components.
    """

    if isinstance(data, Subset):
        mask = data.to_mask()
        data = data.data
    else:
        mask = None

    from h5py import File

    f = File(filename, 'w')

    for cid in data.visible_components:

        if components is not None and cid not in components:
            continue

        comp = data.get_component(cid)
        if comp.categorical:
            if comp.labels.dtype.kind == 'U':
                values = np.char.encode(comp.labels, encoding='ascii', errors='replace')
            else:
                values = comp.labels.copy()
        else:
            values = comp.data.copy()

        if mask is not None:
            if values.ndim == 1:
                values = values[mask]
            else:
                if values.dtype.kind == 'f':
                    values[~mask] = np.nan
                elif values.dtype.kind == 'i':
                    values[~mask] = 0
                elif values.dtype.kind == 'S':
                    values[~mask] = ''
                else:
                    warnings.warn("Unknown data type in HDF5 export: {0}".format(values.dtype))
                    continue

        print(values)

        f.create_dataset(cid.label, data=values)

    f.close()
Example #22
    def _save_as_hdf5_rec(cls, obj: Mapping[str, Union[Mapping, np.ndarray]], root: h5py.File):
        for k, v in obj.items():
            if isinstance(v, np.ndarray):
                root.create_dataset(name=k, data=v)
            elif isinstance(v, dict):
                grp = root.create_group(name=k)
                cls._save_as_hdf5_rec(v, grp)
            elif isinstance(v, Number):
                root.create_dataset(name=k, data=v)
            else:
                raise ValueError(f'Does not support type {type(v)}')
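A minimal sketch of driving such a recursive saver, assuming the method above is a classmethod on some class (MyStore is a stand-in name) and that arrays, dicts and plain numbers are the only value types:

import h5py
import numpy as np

nested = {
    'weights': np.arange(6).reshape(2, 3),
    'meta': {'epoch': 3, 'loss': 0.12},
}
with h5py.File('nested_demo.h5', 'w') as f:
    # Dicts become groups, arrays and numbers become datasets, mirroring the recursion above.
    MyStore._save_as_hdf5_rec(nested, f)
    print(f['meta/epoch'][()])   # 3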
Example #23
    def _save_values(self, file: h5py.File) -> None:
        """Save values needed to reproduce fit

        Args:
            file (h5py.File): Opened file to save to
        """
        file.create_dataset(
            self.stain_matrix_key,
            data=self.stain_matrix_target,
            compression="gzip",
            compression_opts=9,
        )
Example #24
def write_datasubset(
    infile: h5py.File,
    outfile: h5py.File,
    mask: SWIFTMask,
    dataset_names: List[str],
    links_list: List[str],
):
    """
    Writes the subset of all datasets contained in the snapshot according to the specified mask.

    Parameters
    ----------
    infile : h5py.File
        hdf5 file handle for input snapshot
    outfile : h5py.File
        hdf5 file handle for output snapshot
    mask : SWIFTMask
        the mask used to define subset that is written to new snapshot
    dataset_names : list of str
        names of datasets found in the snapshot
    links_list : list of str
        names of links found in the snapshot
    """
    skip_list = links_list.copy()
    skip_list.extend(["Cells", "SubgridScheme"])
    if mask is not None:
        for name in dataset_names:
            if any([substr for substr in skip_list if substr in name]):
                continue

            # get output dtype and size
            first_value = infile[name][0]
            output_type = first_value.dtype
            output_size = first_value.size
            mask_size = get_dataset_mask(mask, name, suffix="_size")
            if output_size != 1:
                output_shape = (mask_size, output_size)
            else:
                output_shape = mask_size

            dataset_mask = get_dataset_mask(mask, name)

            subset = read_ranges_from_file(
                infile[name],
                dataset_mask,
                output_shape=output_shape,
                output_type=output_type,
            )

            # Write the subset
            outfile.create_dataset(name, data=subset)
            for attr_name, attr_value in infile[name].attrs.items():
                outfile[name].attrs.create(attr_name, attr_value)
Example #25
    def _setup_hdf5(self, h5_file: h5py.File):
        """Sets up an HDF5 file to work as a database.

        Parameters
        ----------
        h5_file
            HDF5 file to set up. Must be opened in write mode.
        """
        if self.label_dtype is None:
            self.label_dtype = self._default_label_dtype
        if self.feature_dtype is None:
            self.feature_dtype = self._default_feature_dtype
        h5_file.create_dataset('features',
                               shape=(0, 0),
                               dtype=self.feature_dtype,
                               maxshape=(None, None))
        h5_file.create_dataset('labels',
                               shape=(0, 0, 0),
                               dtype=self.label_dtype,
                               maxshape=(None, None, None))
        h5_file.create_dataset('instance_ids',
                               shape=(0, ),
                               dtype=int,
                               maxshape=(None, ))
        h5_file.create_dataset('labeller_ids',
                               shape=(0, ),
                               dtype=int,
                               maxshape=(None, ))
        h5_file.attrs['label_dtype'] = self.label_dtype
        h5_file.attrs['feature_dtype'] = self.feature_dtype
        h5_file.attrs['n_features'] = -1
        h5_file.attrs['label_dim'] = -1
Example #26
def compress_and_store(
    hd5: h5py.File,
    data: np.ndarray,
    hd5_path: str,
):
    """Support function that takes arbitrary input data in the form of a Numpy array
    and compress, store, and checksum the data in a HDF5 file.

    Args:
        hd5 (h5py.File): Target HDF5-file handle.
        data (np.ndarray): Data to be compressed and saved.
        hd5_path (str): HDF5 dataframe path for the stored data.
    """
    data = data.copy(order='C')  # Required for xxhash
    compressed_data = blosc.compress(data.tobytes(),
                                     typesize=2,
                                     cname='zstd',
                                     clevel=9)
    hash_uncompressed = xxhash.xxh128_digest(data)
    hash_compressed = xxhash.xxh128_digest(compressed_data)
    # Round-trip check: decompress and re-hash. Note this assumes the input
    # array is uint16, consistent with typesize=2 above.
    decompressed = np.frombuffer(blosc.decompress(compressed_data),
                                 dtype=np.uint16).reshape(data.shape)
    assert xxhash.xxh128_digest(decompressed) == hash_uncompressed
    dset = hd5.create_dataset(hd5_path, data=np.void(compressed_data))
    # Store meta data:
    # 1) Shape of the original tensor
    # 2) Hash of the compressed data
    # 3) Hash of the uncompressed data
    dset.attrs['shape'] = data.shape
    dset.attrs['hash_compressed'] = np.void(hash_compressed)
    dset.attrs['hash_uncompressed'] = np.void(hash_uncompressed)
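The stored attributes are enough to reverse the process; a hedged sketch of the corresponding read path, which (like the round-trip check above) assumes the original array was uint16:

import blosc
import h5py
import numpy as np

def load_compressed(hd5: h5py.File, hd5_path: str) -> np.ndarray:
    dset = hd5[hd5_path]
    # The dataset holds one opaque blob; the attributes carry the original shape.
    compressed = dset[()].tobytes()
    data = np.frombuffer(blosc.decompress(compressed), dtype=np.uint16)
    return data.reshape(dset.attrs['shape'])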
Example #27
    def serialize_samples(self, writer: h5py.File, data_file: str,
                          label_file: str):
        frames, labels, seq_num, num_samples, names = self._get_samples(
            data_file, label_file)

        # store data
        writer.create_dataset('audio', data=frames)
        writer.create_dataset('labels', data=labels)

        # Save meta-data
        writer.attrs['data_file'] = str(data_file)
        writer.attrs['label_file'] = str(label_file)
        writer.attrs['seq_num'] = seq_num
        writer.attrs['num_samples'] = num_samples
        #writer.attrs['label_names'] = names[1:]
        writer.attrs['label_names'] = names
Example #28
def calculate_distances(name):
    index_ids, index_vectors = get_data('index', name)
    test_ids, test_vectors = get_data('test', name)
    logger.info('data is read')

    index_vectors, test_vectors = map(arr, (index_vectors, test_vectors))
    logger.info('tensors are ready')

    index_ids = index_ids
    test_ids = test_ids

    shape = len(test_ids), len(index_ids)

    file = File('data/distances.h5', 'w')
    result = file.create_dataset('result', shape=shape, dtype=np.uint8)
    logger.info('h5 file is ready')

    index_vectors = index_vectors.view(-1, SHAPE).cuda()
    for i in tqdm(np.arange(shape[0]), desc='calculating cosine'):
        c = cosine(test_vectors[i].view(-1, SHAPE), index_vectors)
        result[i, :] = c

    for i, v in tqdm(zip(index_ids, index_vectors),
                     desc='removing empty pics'):
        if v is None:
            result[:, i] = 255

    file.close()
Example #29
    def _set_outputs(self, output_file: h5py.File,
                     outputs: Union[Tuple, Any]) -> None:
        """Save the step output to a given h5 file

        Args:
            output_file (h5py.File): File to write to
            outputs (Union[Tuple, Any]): Computed step output
        """
        if not isinstance(outputs, tuple):
            outputs = tuple([outputs])
        for i, output in enumerate(outputs):
            output_file.create_dataset(
                f"{self.output_key}_{i}",
                data=output,
                compression="gzip",
                compression_opts=9,
            )
Example #30
File: data.py Project: liufengbrain/ml4h
def write_in_hd5_ukbb(
    name: str,
    storage_type: StorageType,
    value: np.ndarray,
    hd5: h5py.File,
    compression: str,
):
    """Replicates storage behavior in tensor_writer_ukbb"""
    if storage_type == StorageType.STRING:
        hd5.create_dataset(name,
                           data=value,
                           dtype=h5py.special_dtype(vlen=str))
    elif storage_type == StorageType.CONTINUOUS:
        hd5.create_dataset(name, data=value, compression=compression)
    else:
        raise NotImplementedError(
            f'{storage_type} cannot be automatically written yet')
Example #31
def compile_ace_h5(wav_loc, saveloc, ft='.wav', all_single_channel=False):
    """

    Create an HDF5 dataset which contains information about a set of files which describe AIRs of
    acoustic environments. This file can be used to train DNNs using ace_discriminative_nets.py

    Args:
        wav_loc: The location of the wav files as a list
        saveloc: The location to save to the HDF5 file
        ft: The file type to look for
        all_single_channel: Assume that all responses are single channel (faster and does not
        require soxi)

    Returns:
        Nothing

    """
    from utils_base import find_all_ft, run_command
    try:
        from os.path import abspath
    except ImportError:
        raise
    from h5py import File

    all_wavs = find_all_ft(wav_loc, ft=ft, use_find=True)
    channels = []
    for i in range(len(all_wavs)):
        print('Reading : ' + all_wavs[i])
        all_wavs[i] = abspath(all_wavs[i])
        if all_single_channel:
            channels.append('1')
        else:
            try:
                channels.append(run_command('soxi -c ' + all_wavs[i])[0])
            except OSError as ME:
                print(
                    'soxi does not appear to be installed; using it to get '
                    'the number of channels failed with: ' + str(ME))
                raise

    hf = File(saveloc, 'w')
    hf.create_dataset('filenames', data=all_wavs)
    hf.create_dataset('chan', data=channels)
    hf.close()
    print('Done with : ' + saveloc)
Example #32
File: fcs.py Project: burtonrj/CytoPy
def overwrite_or_create(file: h5py.File, data: np.ndarray, key: str):
    """
    Check if node exists in hdf5 file. If it does exist, overwrite with the given
    array otherwise create a new dataset.

    Parameters
    ----------
    file: h5py File object
    data: Numpy Array
    key: str

    Returns
    -------
    None
    """
    if key in file:
        del file[key]
    file.create_dataset(key, data=data)
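A short usage sketch for overwrite_or_create, assuming the function above is in scope; names and shapes are illustrative. Deleting the node first is what allows the replacement array to have a different shape or dtype:

import h5py
import numpy as np

with h5py.File('overwrite_demo.h5', 'w') as f:
    overwrite_or_create(f, np.zeros(4), 'index/x')
    overwrite_or_create(f, np.ones(8), 'index/x')   # replaces the old dataset; new shape is fine
    print(f['index/x'].shape)   # (8,)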
Example #33
def _gen_histogram(fd: h5py.File, bins: int):
    # Alright, this has been frustrating:

    # (I) I cannot use a locally scoped queue as pickle
    # is not able to pickle the locally bound function (why would
    # it even do that? It only has to pickle the return value).
    # (II) I cannot use a manager, because pytorch fails miserably
    # as for some reason it thinks it has to reinitialize
    # CUDA even though no tensors are ever used in the forked
    # process.
    # (III) I cannot switch the process start_method to 'spawn' because
    # it was already initialized by modules imported by this module.
    # Also: changing some global state which has side effect even
    # regarding other modules is probably a _very_ bad idea.
    # (IV) When using pathos as a replacement for multiprocessing
    # it spewed a whole new class of different errors and for the moment
    # I simply give that up and use global variables.

    # I hate pickle so much.

    minimum = fd['dists'].attrs['minimum']
    maximum = fd['dists'].attrs['maximum']

    rg = int(minimum - 1), int(maximum + 1)

    print('creating histogram in range [{}, {}]'.format(*rg))

    pool = mp.Pool()
    proc = mp.Process(target=_buffer_stats)
    proc.start()

    chunk_size = BUF_SIZE

    print()
    # map/map_async is deliberately not used here so that the reader
    # can be throttled on systems with little RAM
    for chunk in tqdm(range(fd['dists'].shape[0] // chunk_size)):
        a = chunk * chunk_size
        b = a + chunk_size
        dists = fd['dists'][a:b]
        pool.apply_async(_calc_stats, (dists, bins, rg))

    print('\n', 'waiting for workers to finish')
    pool.close()  # no new tasks will be submitted
    pool.join()

    print('awaiting result')
    # push a sentinel so the stats process knows to stop
    _shitq.put(None)
    proc.join()
    stats = _shitq.get()

    ds_hist = fd.create_dataset('histogram', stats.histogram.shape)
    ds_hist[:] = stats.histogram
    ds_hist.attrs['bin_edges'] = stats.bin_edges

    print('finished creating histogram')
Example #34
def writeData(data, outputFilename):
    """
    Writes data to a tiff, hdf5, or npy file.

    Parameters
    ----------
    data : 3D numpy array
        The data to be written. Must have 3 dimensions, i.e. data.ndim == 3
    outputFilename : string
        The absolute or relative location of the particular file to be read
        in. outputFilename must end in one of the following extensions
        ['.tif', '.tiff', '.hdf5', '.h5', '.npy'].

    Notes
    -----
    - Data to be saved must be a 3D array.

    """

    assert data.ndim==3, "Can only write out 3D hdf5, tiff, and numpy files"
    filename = outputFilename.rstrip('/')
    basePath, fName = os.path.split(filename)
    name, ext = os.path.splitext(fName)
    if basePath and not os.path.exists(basePath):
        raise IOError, "Directory does not exist: %s" % (basePath)

    if ext.lower() in ['.npy']:
        try:
            np.save(filename, np.array(data,dtype=np.float32))
        except IOError:
            raise IOError, "Error writing npy data to: \"%s\"" % filename

    elif ext.lower() in ['.h5', '.hdf5']:
        from h5py import File
        try:
            h5File = File(filename, "w")
        except IOError:
            raise IOError, "Error creating writable hdf5 file at: \"%s\"" % filename

        shp = data.shape
        comp="gzip"
        compOpts=1
        dset = h5File.create_dataset("/raw", shp, np.float32, data, chunks=shp, compression=comp, compression_opts=compOpts)
        h5File.close()

    elif ext.lower() in ['.tif', '.tiff']:
        from libtiff import TIFF
        try:
            tiff = TIFF.open(filename, 'w')
            tiff.write_image(np.array(data,dtype=np.float32))
        except IOError:
            raise IOError, "Error writing tif file at: \"%s\"" % filename
        tiff.close()

    else:
        assert False, "Can only write out 3D hdf5, tiff, and numpy files"
Example #35
def add_vanhateren_subset(h5handle: h5py.File, prefix, imglist, imgbase):
    length_filename = len(imglist[0])
    for imgname in imglist:
        assert len(imgname) == length_filename
    imglist_dtype = 'S' + str(length_filename)

    imgarray_all = []
    for idx, imgname in enumerate(imglist):
        with open(os.path.join(imgbase, imgname), 'rb') as f:
            imgarray = np.fromfile(f, dtype='>u2').reshape((1024, 1536))
            assert imgarray.size == 1024 * 1536
            imgarray = imgarray[:, 2:-2]  # remove black stuff.
            imgarray_all.append(imgarray)
        if idx % 100 == 0:
            print(idx)
    data_to_write = np.asarray(imgarray_all)
    print(data_to_write.shape, data_to_write.dtype)
    h5handle.create_dataset(prefix + '_data', data=data_to_write,
                            chunks=(1, 256, 1532),  # basically, make each chunk 1/4 of an image.
                            compression='gzip', shuffle=True, fletcher32=True)
    h5handle.create_dataset(prefix + '_filelist', data=np.array(imglist, dtype=imglist_dtype))
Example #36
from mpi4py import MPI
from h5py import File
import numpy as np
import time

rank = MPI.COMM_WORLD.Get_rank()
numProcs = MPI.COMM_WORLD.Get_size()
procsList = np.arange(numProcs)

def status(message, ranks=procsList):
    if rank in ranks:
        print("%s, process %d/%d: %s" % (time.strftime("%I:%M:%S%p"), rank + 1, numProcs, message))

def report(message):
    status(message, [0])

def reportbarrier(message):
    MPI.COMM_WORLD.Barrier()
    report(message)

# fname, numRows and numCols are assumed to be defined earlier in the original script
fout = File(fname, "w", driver="mpio", comm=MPI.COMM_WORLD)
rows = fout.create_dataset("rows", (numRows, numCols), dtype=np.float64)

Example #37
from h5py import File
import numpy as np

# about 3 GB
m = int(2e6) 
n = 200
k = 100
r = 20

W = np.random.random((m,r))
H = np.zeros((r, n))
H[:, :r] = np.eye(r)
H[:, r:] = np.random.random((r, n-r))
for i in np.arange(2, 20, 1):
    temp = H[:, i].copy()  # copy the column; a bare slice is a view, so the swap below would otherwise be lost
    H[:, i] = H[:, 10*i]
    H[:, 10*i] = temp

fout = File("testdata.h5", "w")
fout.create_dataset("mat", data=W.dot(H))
fout.close()



Example #38
File: storage.py Project: OpenWIM/pywim
def create_data_set(
    data_file: h5py.File,
    data: pd.DataFrame,
    sample_rate: int=None,
    date_time: datetime=datetime.now(),
    site_id: str='000',
    lane_id: str='00',
    temperature: float=None,
    license_plate: str=None,
    sensor_calibration_factory: list=None,
    distance_between_sensors: list=None,
    sensor_type: str=None,
    sensors_layout: str=None,
    channel_configuration: str=None,
    **kwargs
) -> h5py.Dataset:
    """

    :param data_file:
    :param data:
    :param sample_rate: (e.g. 2000)
    :param date_time: (e.g. 2017-04-04 00:49:36)
    :param site_id: (e.g. 001)
    :param lane_id: (e.g. 01)
    :param temperature: (e.g. 28.5)
    :param license_plate: (e.g. AAA9999)
    :param sensor_calibration_factory: (e.g. [0.98, 0.99, 0.75])
    :param distance_between_sensors: (e.g. [1.0, 1.5, 2.0])
    :param sensor_type: (e.g. quartz, polymer, ceramic, mixed)
    :param sensors_layout: (e.g. |/|\|<|>|=|)
    :param channel_configuration: (this is a, optional attribute, it is
        required just when sensor type is mixed,
        e.g. "{'a0': 'polymer', 'a1': 'ceramic'})"
    :param kwargs:
    :return:
    """

    dset_id = 'run_{}_{}_{}'.format(
        site_id, lane_id, date_time.strftime('%Y%m%d_%H%M%S')
    )

    dset = data_file.create_dataset(
        dset_id, shape=(data.shape[0],),
        dtype=np.dtype([
            (k, float) for k in ['index'] + list(data.keys())
        ])
    )

    dset['index'] = data.index

    for k in data.keys():
        dset[k] = data[k]

    dset.attrs['sample_rate'] = sample_rate
    dset.attrs['date_time'] = date_time.strftime('%Y-%m-%d %H:%M:%S')
    dset.attrs['site_id'] = site_id
    dset.attrs['lane_id'] = lane_id
    dset.attrs['temperature'] = temperature
    dset.attrs['license_plate'] = license_plate
    dset.attrs['sensor_calibration_factory'] = sensor_calibration_factory
    dset.attrs['distance_between_sensors'] = distance_between_sensors
    dset.attrs['sensor_type'] = sensor_type
    dset.attrs['sensors_layout'] = sensors_layout
    dset.attrs['channel_configuration'] = channel_configuration

    if kwargs:
        for k, v in kwargs.items():
            dset.attrs[k] = v

    return dset
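A hedged usage sketch for create_data_set, assuming the function and its imports above are available; the signal values are random and the metadata is illustrative. The optional parameters are given explicit values here because h5py attributes cannot store None:

from datetime import datetime
import h5py
import numpy as np
import pandas as pd

signals = pd.DataFrame({'a0': np.random.rand(2000), 'a1': np.random.rand(2000)})
with h5py.File('wim_demo.h5', 'w') as f:
    dset = create_data_set(
        f, signals,
        sample_rate=2000,
        date_time=datetime(2017, 4, 4, 0, 49, 36),
        site_id='001',
        lane_id='01',
        temperature=28.5,
        license_plate='AAA9999',
        sensor_calibration_factory=[0.98, 0.99, 0.75],
        distance_between_sensors=[1.0, 1.5, 2.0],
        sensor_type='quartz',
        sensors_layout='|/|\\|',
        channel_configuration='n/a',
    )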