def write_enum_datasets(f):
    """Write 1D and 2D enum datasets for every unsigned basetype, then close *f*.

    Creates datasets named ``enum_<base>_data`` (shape (4,)) and
    ``2d_enum_<base>_data`` (shape (2, 2)) for base types uint8..uint64,
    all sharing the RED/GREEN/BLUE/YELLOW member mapping.

    :param f: writable h5py File; flushed and closed on return.
    """
    # The member mapping is identical for every dataset; only the base
    # integer width varies, so drive all eight datasets from one table
    # instead of eight copy-pasted creation stanzas.
    members = {"RED": 0, "GREEN": 1, "BLUE": 2, "YELLOW": 3}
    basetypes = {"uint8": np.uint8, "uint16": np.uint16,
                 "uint32": np.uint32, "uint64": np.uint64}

    data_1d = np.arange(4)
    data_2d = np.arange(4).reshape(2, 2)
    for name, basetype in basetypes.items():
        enum_type = h5py.enum_dtype(members, basetype=basetype)
        f.create_dataset(f"enum_{name}_data", data=data_1d, dtype=enum_type)
        f.create_dataset(f"2d_enum_{name}_data", data=data_2d, dtype=enum_type)
    f.flush()
    f.close()
def test_create(self):
    """ Enum datasets can be created and type correctly round-trips """
    enum_type = h5py.enum_dtype(self.EDICT, basetype='i')
    dset = self.f.create_dataset('x', (100, 100), dtype=enum_type)
    recovered = h5py.check_enum_dtype(dset.dtype)
    self.assertEqual(recovered, self.EDICT)
def write_spikes(filepath):
    """Write a spikes file with two populations: one sorted by time, one by id."""
    base_timestamps = (0.3, 0.1, 0.2, 1.3, 0.7)
    base_node_ids = (1, 2, 0, 0, 2)
    sorting_dtype = h5py.enum_dtype({"none": 0, "by_id": 1, "by_time": 2})
    with h5py.File(filepath, "w") as h5f:
        h5f.create_group("spikes")

        # First population: spikes ordered by timestamp ("by_time" == 2).
        pop = h5f.create_group("/spikes/" + "default")
        pop.attrs.create("sorting", data=2, dtype=sorting_dtype)
        by_time = sorted(zip(base_timestamps, base_node_ids))
        pop.create_dataset("timestamps",
                           data=[t for t, _ in by_time], dtype=np.double)
        pop.create_dataset("node_ids",
                           data=[n for _, n in by_time], dtype=np.uint64)

        # Second population: spikes ordered by node id ("by_id" == 1).
        pop2 = h5f.create_group("/spikes/" + "default2")
        pop2.attrs.create("sorting", data=1, dtype=sorting_dtype)
        by_id = sorted(zip(base_node_ids, base_timestamps))
        pop2.create_dataset("timestamps",
                            data=[t for _, t in by_id], dtype=np.double)
        pop2.create_dataset("node_ids",
                            data=[n for n, _ in by_id], dtype=np.uint64)
def test_compound_vlen(self):
    """Member offsets of compound dtypes mixing vlen and enum fields."""
    vlen_t = h5py.vlen_dtype(np.uint8)
    enum_t = h5py.enum_dtype({'OFF': 0, 'ON': 1}, basetype=np.uint8)
    for np_align in (False, True):
        compound = np.dtype(
            [('a', enum_t), ('foo', vlen_t), ('bar', vlen_t), ('switch', enum_t)],
            align=np_align)
        expected_offsets = [compound.fields[name][1] for name in compound.names]
        for logical in (False, True):
            if logical and np_align:
                # Vlen types have different size in the numpy struct
                self.assertRaises(TypeError, h5py.h5t.py_create, compound,
                                  logical=logical)
            else:
                htype = h5py.h5t.py_create(compound, logical=logical)
                actual_offsets = [htype.get_member_offset(m)
                                  for m in range(htype.get_nmembers())]
                if np_align:
                    self.assertEqual(expected_offsets, actual_offsets)
def write_spikes(filepath):
    """Write 'default' (time-sorted) and 'default2' (id-sorted) spike populations."""
    raw_timestamps = (0.3, 0.1, 0.2, 1.3, 0.7)
    raw_node_ids = (1, 2, 0, 0, 2)
    sorting_enum = h5py.enum_dtype({"none": 0, "by_id": 1, "by_time": 2})

    def fill(group, sorting_code, timestamps, node_ids):
        # Each population records its sort order plus two parallel arrays.
        group.attrs.create('sorting', data=sorting_code, dtype=sorting_enum)
        group.create_dataset('timestamps', data=timestamps, dtype=np.double)
        group.create_dataset('node_ids', data=node_ids, dtype=np.uint64)

    with h5py.File(filepath, 'w') as h5f:
        h5f.create_group('spikes')
        by_time = sorted(zip(raw_timestamps, raw_node_ids))
        fill(h5f.create_group('/spikes/' + 'default'), 2,
             [t for t, _ in by_time], [n for _, n in by_time])
        by_id = sorted(zip(raw_node_ids, raw_timestamps))
        fill(h5f.create_group('/spikes/' + 'default2'), 1,
             [t for _, t in by_id], [n for n, _ in by_id])
def main():
    """Write a sample dataset carrying attributes of several HDF5 types."""
    with h5py.File(file_path, 'w') as out:
        ds = out.create_dataset('/group/dataset', shape=(3, 4), dtype='i')
        ds[:] = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]
        ds.attrs['double'] = math.pi
        greeting = '早上好!'
        ds.attrs['string-vlen'] = greeting
        ascii_text = 'Hello, world!'
        ds.attrs.create('string-ascii', ascii_text, None,
                        '<S{0}'.format(len(ascii_text)))
        encoded = greeting.encode('utf-8')
        # HDFView can not display the value of this attribute correctly, ViTables can.
        ds.attrs.create('string', encoded, None,
                        h5py.string_dtype('utf-8', len(encoded)))
        ds.attrs['boolean'] = True
        palette = h5py.enum_dtype({"RED": 0, "GREEN": 1, "BLUE": 42}, basetype='i')
        ds.attrs.create('color', 42, dtype=palette)
def test_readwrite(self):
    """ Enum datasets can be read/written as integers """
    enum_t = h5py.enum_dtype(self.EDICT, basetype='i4')
    dset = self.f.create_dataset('x', (100, 100), dtype=enum_t)
    dset[35, 37] = 42
    dset[1, :] = 1
    self.assertEqual(dset[35, 37], 42)
    expected_row = np.full(100, 1, dtype='i4')
    self.assertArrayEqual(dset[1, :], expected_row)
def write_simple_attributes(self, group_object):
    """Write this object's simple (scalar) attributes onto an HDF5 group/dataset.

    Iterates ``self._attributes`` and writes each value as an HDF5 attribute:
    dates/times become ISO character strings (per S100 section 10C-7,
    table 10C-1), Enum members become HDF5 enumerated-type attributes, and
    other plain values are written directly.  Sequence/group/object values
    are skipped -- they cannot be simple HDF5 attributes.

    :param group_object: h5py Group or Dataset receiving the attributes.
    """
    self._hdf5_path = group_object.name
    for key, val in self._attributes.items():
        if isinstance(val, s1xx_sequence_types):
            continue  # skip these types for now
        elif isinstance(val, S1xxWritesGroupObjects):
            continue  # skip these types for now
        elif isinstance(val, S1xxObject):
            continue  # skip these types for now
        elif isinstance(val, (datetime.date, datetime.datetime, datetime.time)):
            # BUG FIX: the original passed a "{}" placeholder into logging's
            # %-style lazy formatting ("key datetime: {}" was logged with the
            # value never interpolated).  Use lazy %s args instead.
            logging.debug("%s datetime: %s", key, val)
            group_object.attrs[key] = val.isoformat()
        elif isinstance(val, Enum):
            logging.debug("%s enumeration: %s", key, val)
            # Map every member of the Enum class to its integer value.
            enum_as_dict = collections.OrderedDict(
                [[item.name, item.value] for item in type(val)])
            int_type = numpy.uint8
            try:
                # enum_dtype is added in h5py 2.10
                enumtype = h5py.enum_dtype(enum_as_dict, int_type)
            except AttributeError:
                # special_dtype is for h5py <= 2.9
                enumtype = h5py.special_dtype(enum=(int_type, enum_as_dict))
            try:
                group_object.attrs.create(key, val.value, dtype=enumtype)
            except TypeError:
                # h5py isn't accepting OrderedDict, convert to dict
                try:
                    enumtype = h5py.enum_dtype(dict(enum_as_dict), int_type)
                except AttributeError:
                    enumtype = h5py.special_dtype(
                        enum=(int_type, dict(enum_as_dict)))
                group_object.attrs.create(key, val.value, dtype=enumtype)
        else:
            logging.debug("%s simple type: %s", key, val)
            group_object.attrs[key] = val
def write_params(output_path, pop_params_dict):
    """Append per-gid synaptic parameter values to an HDF5 file.

    Builds an enumerated 'parameter_enum' type over all parameter keys plus a
    compound 'parameters_type' of (parameter, value), then writes one dataset
    per gid under /Parameters/<population>/<gid>.

    :param output_path: path of the HDF5 file, opened in append mode.
    :param pop_params_dict: {population: {gid: [param dict, ...]}} where each
        param dict has population/source/sec_type/syn_name/param_path keys
        plus 'param_val'.
    """
    output_pop_parameters = {}
    param_key_list = []
    for population in pop_params_dict:
        this_pop_output_parameters = {}
        for gid in pop_params_dict[population]:
            this_gid_param_dicts = pop_params_dict[population][gid]
            this_output_params = {}
            for pd in this_gid_param_dicts:
                # A parameter's identity is the dotted path of its coordinates.
                param_key = f'{pd["population"]}.{pd["source"]}.{pd["sec_type"]}.{pd["syn_name"]}.{pd["param_path"]}'
                param_val = pd["param_val"]
                param_key_list.append(param_key)
                this_output_params[param_key] = param_val
            this_pop_output_parameters[f'{gid}'] = this_output_params
        output_pop_parameters[population] = this_pop_output_parameters
    param_keys = set(param_key_list)

    output_file = h5py.File(output_path, 'a')
    param_mapping = {name: idx for (idx, name) in enumerate(param_keys)}

    parameters_grp = h5_get_group(output_file, 'Parameters')
    if 'parameters_type' not in parameters_grp:
        # Committed enum + compound types; created once per file.
        dt = h5py.enum_dtype(param_mapping, basetype=np.uint16)
        parameters_grp['parameter_enum'] = dt
        dt = np.dtype([("parameter", parameters_grp['parameter_enum']),
                       ("value", np.float32)])
        parameters_grp['parameters_type'] = dt
    for population in output_pop_parameters:
        pop_grp = h5_get_group(parameters_grp, population)
        this_pop_output_parameters = output_pop_parameters[population]
        for id_str in this_pop_output_parameters:
            this_output_params = this_pop_output_parameters[id_str]
            dset = h5_get_dataset(pop_grp, id_str,
                                  maxshape=(len(this_output_params),),
                                  dtype=parameters_grp['parameters_type'].dtype)
            dset.resize((len(this_output_params),))
            a = np.zeros(len(this_output_params),
                         dtype=parameters_grp['parameters_type'].dtype)
            # viewitems() was a py2-compat shim; this file already requires
            # Python 3 (f-strings above), so plain .items() is equivalent.
            for idx, (parm, val) in enumerate(this_output_params.items()):
                a[idx]["parameter"] = param_mapping[parm]
                a[idx]["value"] = val
            dset[:] = a
    output_file.close()
def write_group(self, dict, hf_group):
    """Write a mapping of key/value pairs onto an HDF5 group.

    Strings become ascii-bytes attributes, scalars become attributes, lists
    become datasets, and alt_core enum-like members are stored via an HDF5
    enumerated type.  Unrecognized value types are silently ignored.

    NOTE(review): the parameter name ``dict`` shadows the builtin; kept
    unchanged to preserve the keyword-call interface.
    """
    for key, value in dict.items():
        if isinstance(value, str):
            hf_group.attrs[key] = bytes(value, encoding="ascii")
        elif isinstance(value, (float, bool, int)):
            hf_group.attrs[key] = value
        elif isinstance(value, list):
            hf_group.create_dataset(key, data=value)
        elif value is None:  # idiomatic form of `type(value) is type(None)`
            # FIXME(review): this stores the np.empty *function object* as the
            # attribute value, which is almost certainly not intended --
            # confirm the desired encoding of missing values.
            hf_group.attrs[key] = np.empty
        elif value.__module__ == "alt_core" and "__members__" in dir(value):
            # Enum-like value: build an HDF5 enum type from its members.
            # (The comprehension previously shadowed the loop variable `key`.)
            members = {name: member.__int__()
                       for name, member in type(value).__members__.items()}
            dt = h5py.enum_dtype(members, basetype='i')
            hf_group.attrs.create(key, value.__int__(), dtype=dt)
        else:
            pass  # unsupported type: deliberately ignored
def test_vlen_enum(self):
    """A vlen-of-enum dataset round-trips values and nested enum type."""
    fname = self.mktemp()
    source = [[1], [1, 2]]
    vlen_of_enum = h5py.vlen_dtype(h5py.enum_dtype(dict(foo=1, bar=2), 'i'))
    with h5py.File(fname, 'w') as f:
        dset = f.create_dataset('test', (len(source),), dtype=vlen_of_enum)
        dset[:] = np.array(source)
    with h5py.File(fname, 'r') as f:
        stored = f['test']
        roundtrip_dtype = stored.dtype
        roundtrip = [row.tolist() for row in stored[:]]
    self.assertEqual(source, roundtrip)
    self.assertEqual(
        h5py.check_enum_dtype(h5py.check_vlen_dtype(vlen_of_enum)),
        h5py.check_enum_dtype(h5py.check_vlen_dtype(roundtrip_dtype)))
def test_vlen_enum(self):
    """Write then read a vlen-of-enum dataset; data and dtype must survive."""
    path = self.mktemp()
    expected = [[1], [1, 2]]
    written_dtype = h5py.vlen_dtype(h5py.enum_dtype(dict(foo=1, bar=2), 'i'))
    with h5py.File(path, 'w') as handle:
        ds = handle.create_dataset('test', (len(expected),), dtype=written_dtype)
        ds[:] = np.array(expected)
    with h5py.File(path, 'r') as handle:
        ds = handle['test']
        read_dtype = ds.dtype
        actual = [element.tolist() for element in ds[:]]
    self.assertEqual(expected, actual)
    original_enum = h5py.check_enum_dtype(h5py.check_vlen_dtype(written_dtype))
    recovered_enum = h5py.check_enum_dtype(h5py.check_vlen_dtype(read_dtype))
    self.assertEqual(original_enum, recovered_enum)
def write_spikes(filepath):
    """Write a spikes file with four populations exercising all sort orders.

    'All' is sorted by time, 'spikes1' by id, 'spikes2' is unsorted (its
    timestamps carry a 'units' attribute), and 'empty' has no spikes.
    """
    population_names = ['All', 'spikes1', 'spikes2', 'empty']
    timestamps_base = (0.3, 0.1, 0.2, 1.3, 0.7)
    node_ids_base = (3, 5, 2, 3, 2)
    sorting_type = h5py.enum_dtype({"none": 0, "by_id": 1, "by_time": 2})
    string_dtype = h5py.special_dtype(vlen=get_vlen_str_type())
    with h5py.File(filepath, 'w') as h5f:
        h5f.create_group('spikes')

        # Population sorted by time (sorting == 2).
        gpop_all = h5f.create_group('/spikes/' + population_names[0])
        gpop_all.attrs.create('sorting', data=2, dtype=sorting_type)
        timestamps, node_ids = zip(*sorted(zip(timestamps_base, node_ids_base)))
        # BUG FIX: the original bound this dataset to a local named ``set``,
        # shadowing the builtin; the handle was never used, so drop it.
        gpop_all.create_dataset('timestamps', data=timestamps, dtype=np.double)
        gpop_all.create_dataset('node_ids', data=node_ids, dtype=np.uint64)

        # Population sorted by node id (sorting == 1).
        gpop_spikes1 = h5f.create_group('/spikes/' + population_names[1])
        gpop_spikes1.attrs.create('sorting', data=1, dtype=sorting_type)
        node_ids, timestamps = zip(*sorted(zip(node_ids_base, timestamps_base)))
        gpop_spikes1.create_dataset('timestamps', data=timestamps, dtype=np.double)
        gpop_spikes1.create_dataset('node_ids', data=node_ids, dtype=np.uint64)

        # Unsorted population (sorting == 0); timestamps carry a units attr.
        gpop_spikes2 = h5f.create_group('/spikes/' + population_names[2])
        gpop_spikes2.attrs.create('sorting', data=0, dtype=sorting_type)
        dtimestamps = gpop_spikes2.create_dataset('timestamps',
                                                  data=timestamps_base,
                                                  dtype=np.double)
        dtimestamps.attrs.create('units', data="ms", dtype=string_dtype)
        gpop_spikes2.create_dataset('node_ids', data=node_ids_base,
                                    dtype=np.uint64)

        # Empty population: datasets exist but contain no spikes.
        gpop_empty = h5f.create_group('/spikes/' + population_names[3])
        gpop_empty.attrs.create('sorting', data=1, dtype=sorting_type)
        gpop_empty.create_dataset('timestamps', data=[], dtype=np.double)
        gpop_empty.create_dataset('node_ids', data=[], dtype=np.uint64)
def save_to_h5(self, hf):
    """Serialize tokenized columns into an open HDF5 file.

    Each entry of ``self._tokens`` maps a dataset name to {'dtype', 'data'}:
    'int' data is stored directly, 'enum' data as uint8 codes with an HDF5
    enumerated type, and 'str' data as utf8 variable-length strings.

    :param hf: writable h5py File/Group.
    :raises ValueError: on an unrecognized dtype tag.
    """
    for name, column in self._tokens.items():
        dtype = column['dtype']
        data = column['data']
        if dtype == 'int':
            hf.create_dataset(name, data=data)
        elif dtype == 'enum':
            # BUG FIX: enumerate(set(...)) assigned enum codes in hash order,
            # which can differ between interpreter runs; sorting makes the
            # value->code mapping deterministic.  (Assumes enum values are
            # mutually comparable, e.g. all strings -- TODO confirm.)
            mapping = {value: code
                       for code, value in enumerate(sorted(set(data)))}
            assert len(mapping) <= 0xff
            enum_t = h5py.enum_dtype(mapping, basetype=np.uint8)
            hf.create_dataset(name, dtype=enum_t,
                              data=[mapping[x] for x in data])
        elif dtype == 'str':
            str_t = h5py.string_dtype(encoding='utf8')
            values = ['' if x is None else str(x) for x in data]
            hf.create_dataset(name, dtype=str_t,
                              data=[s.encode("utf8") for s in values],
                              compression='lzf')
        else:
            raise ValueError(dtype)
def test_compound_vlen_enum(self):
    """Compound records with two vlen fields and an enum field round-trip."""
    onoff = h5py.enum_dtype({'OFF': 0, 'ON': 1}, basetype=np.uint8)
    vlen_uint8 = h5py.vlen_dtype(np.uint8)

    def u8(items):
        return np.array(items, dtype=np.uint8)

    record_t = np.dtype([('foo', vlen_uint8),
                         ('bar', vlen_uint8),
                         ('switch', onoff)])
    dset = self.f.create_dataset('dt_vve', shape=(2,), dtype=record_t)
    expected = np.array([(u8([1, 2, 3]), u8([1, 2]), 1),
                         (u8([]), u8([2, 4, 6]), 0)], dtype=record_t)
    dset[:] = expected
    actual = dset[:]
    for field in ('foo', 'bar'):
        self.assertVlenArrayEqual(expected[field], actual[field])
    self.assertArrayEqual(expected['switch'], actual['switch'])
def test_compound_vlen_enum(self):
    """Writing and reading back (vlen, vlen, enum) compound records."""
    switch_t = h5py.enum_dtype({'OFF': 0, 'ON': 1}, basetype=np.uint8)
    ragged_t = h5py.vlen_dtype(np.uint8)
    compound_t = np.dtype([('foo', ragged_t),
                           ('bar', ragged_t),
                           ('switch', switch_t)])
    written = np.zeros(2, dtype=compound_t)
    written[0] = (np.array([1, 2, 3], dtype=np.uint8),
                  np.array([1, 2], dtype=np.uint8), 1)
    written[1] = (np.array([], dtype=np.uint8),
                  np.array([2, 4, 6], dtype=np.uint8), 0)
    dset = self.f.create_dataset('dt_vve', shape=(2,), dtype=compound_t)
    dset[:] = written
    readback = dset[:]
    self.assertVlenArrayEqual(written['foo'], readback['foo'])
    self.assertVlenArrayEqual(written['bar'], readback['bar'])
    self.assertArrayEqual(written['switch'], readback['switch'])
def test_compound_vlen(self):
    """Compound dtypes mixing vlen and enum fields preserve member offsets."""
    vlen_uint8 = h5py.vlen_dtype(np.uint8)
    onoff = h5py.enum_dtype({'OFF': 0, 'ON': 1}, basetype=np.uint8)
    fields = [('a', onoff), ('foo', vlen_uint8),
              ('bar', vlen_uint8), ('switch', onoff)]
    for aligned in (False, True):
        dt = np.dtype(fields, align=aligned)
        numpy_offsets = [dt.fields[n][1] for n in dt.names]
        for logical in (False, True):
            if logical and aligned:
                # Vlen members occupy a different size in the aligned numpy
                # struct, so deriving a logical HDF5 type must fail.
                self.assertRaises(TypeError, h5py.h5t.py_create, dt,
                                  logical=logical)
                continue
            htype = h5py.h5t.py_create(dt, logical=logical)
            hdf5_offsets = [htype.get_member_offset(i)
                            for i in range(htype.get_nmembers())]
            if aligned:
                self.assertEqual(numpy_offsets, hdf5_offsets)
def write_samples(group, dataset, class_dict, class_index, image_shape, chunk_size, num_chunks, compression, compression_opts):
    """Sample images from *dataset* and stream them into 'image'/'label' HDF5 datasets.

    Labels are stored with an enumerated int64 type built from *class_dict*.
    NOTE(review): the *image_shape* parameter is unused -- the shape comes
    from ``dataset.image_shape``; kept for interface compatibility.
    """
    sampler = SamplerFactory().get(
        class_idxs=class_index,
        batch_size=chunk_size,
        n_batches=num_chunks,
        alpha=0.5,
        kind='fixed'
    )
    # BUG FIX: the original bound this value to a local named ``len``,
    # shadowing the builtin for the rest of the function.
    total_samples = chunk_size * num_chunks
    group.create_dataset('image',
                         shape=(total_samples, *dataset.image_shape),
                         chunks=(chunk_size, *dataset.image_shape),
                         dtype=dataset.image_dtype,
                         compression=compression,
                         compression_opts=compression_opts,
                         shuffle=False
                         )
    group.create_dataset('label',
                         shape=(total_samples,),
                         chunks=(chunk_size,),
                         dtype=h5.enum_dtype(class_dict, basetype=np.int64),
                         compression=compression,
                         compression_opts=compression_opts,
                         shuffle=False
                         )
    with Progress() as p:
        task = p.add_task(description=f'[red] writing {group.name}',
                          total=num_chunks)
        # Each loader batch fills one chunk-sized slice of both datasets.
        for i, (image, cls) in enumerate(DataLoader(dataset,
                                                    batch_sampler=sampler,
                                                    num_workers=0)):
            offset = i * chunk_size
            group['image'][offset:offset + chunk_size] = image.numpy()
            group['label'][offset:offset + chunk_size] = cls.numpy()
            p.update(task, total=num_chunks, advance=1)
def h5_init_types(f, opt_id, feature_dtypes, constraint_names, param_names, problem_parameters, spec, metadata=None):
    """Create the committed HDF5 enum/compound types and spec datasets for one run.

    Under group *opt_id* this commits enumerated types for objectives,
    features, constraints and parameters, the matching compound types, and
    fills 'objective_spec', 'feature_spec', 'constraint_spec',
    'problem_parameters' and 'parameter_spec' datasets.

    :param f: writable h5py File.
    :param opt_id: name of the group to create/extend.
    :param feature_dtypes: list of (name, dtype) pairs, or None.
    :param constraint_names: list of constraint names, or None.
    :param param_names: list of optimized parameter names.
    :param problem_parameters: {name: value} of fixed problem parameters.
    :param spec: object with is_integer_variable/lower/upper sequences.
    :param metadata: unused here; kept for interface compatibility.
    """
    opt_grp = h5_get_group(f, opt_id)

    param_keys = set(param_names)
    param_keys.update(problem_parameters.keys())
    # create an HDF5 enumerated type for the parameter label
    param_mapping = {name: idx for (idx, name) in enumerate(param_keys)}

    feature_keys = None
    if feature_dtypes is not None:
        feature_keys = [feature_dtype[0] for feature_dtype in feature_dtypes]

    # create HDF5 types for features, if any
    feature_mapping = None
    if feature_keys is not None:
        feature_mapping = {
            name: idx for (idx, name) in enumerate(feature_keys)
        }

    constraint_mapping = None
    if constraint_names is not None:
        constraint_mapping = {
            name: idx for (idx, name) in enumerate(constraint_names)
        }

    objective_names = ['y']
    objective_mapping = {
        name: idx for (idx, name) in enumerate(objective_names)
    }
    dt = h5py.enum_dtype(objective_mapping, basetype=np.uint16)
    opt_grp['objective_enum'] = dt
    dt = np.dtype({'names': objective_names, 'formats': [np.float32]})
    opt_grp['objective_type'] = dt
    dt = np.dtype([("objective", opt_grp['objective_enum'])])
    opt_grp['objective_spec_type'] = dt
    dset = h5_get_dataset(opt_grp, 'objective_spec',
                          maxshape=(len(objective_names),),
                          dtype=opt_grp['objective_spec_type'].dtype)
    dset.resize((len(objective_names),))
    a = np.zeros(len(objective_names),
                 dtype=opt_grp['objective_spec_type'].dtype)
    for idx, parm in enumerate(objective_names):
        a[idx]["objective"] = objective_mapping[parm]
    dset[:] = a

    if feature_mapping is not None:
        dt = h5py.enum_dtype(feature_mapping, basetype=np.uint16)
        opt_grp['feature_enum'] = dt
        dt = np.dtype([("feature", opt_grp['feature_enum'])])
        opt_grp['feature_spec_type'] = dt
        dt = np.dtype(feature_dtypes)
        opt_grp['feature_type'] = dt
        dset = h5_get_dataset(opt_grp, 'feature_spec',
                              maxshape=(len(feature_keys),),
                              dtype=opt_grp['feature_spec_type'].dtype)
        dset.resize((len(feature_keys),))
        a = np.zeros(len(feature_keys),
                     dtype=opt_grp['feature_spec_type'].dtype)
        for idx, parm in enumerate(feature_keys):
            a[idx]["feature"] = feature_mapping[parm]
        dset[:] = a

    if constraint_mapping is not None:
        dt = h5py.enum_dtype(constraint_mapping, basetype=np.uint16)
        opt_grp['constraint_enum'] = dt
        dt = np.dtype([("constraint", opt_grp['constraint_enum'])])
        opt_grp['constraint_spec_type'] = dt
        # NOTE(review): a single-element 'formats' with multiple names raises
        # in numpy when len(constraint_names) > 1 -- confirm intended shape.
        dt = np.dtype({'names': constraint_names, 'formats': [np.int8]})
        opt_grp['constraint_type'] = dt
        dset = h5_get_dataset(opt_grp, 'constraint_spec',
                              maxshape=(len(constraint_names),),
                              dtype=opt_grp['constraint_spec_type'].dtype)
        dset.resize((len(constraint_names),))
        a = np.zeros(len(constraint_names),
                     dtype=opt_grp['constraint_spec_type'].dtype)
        for idx, parm in enumerate(constraint_names):
            a[idx]["constraint"] = constraint_mapping[parm]
        dset[:] = a

    dt = h5py.enum_dtype(param_mapping, basetype=np.uint16)
    opt_grp['parameter_enum'] = dt
    dt = np.dtype([("parameter", opt_grp['parameter_enum']),
                   ("value", np.float32)])
    opt_grp['problem_parameters_type'] = dt
    dset = h5_get_dataset(opt_grp, 'problem_parameters',
                          maxshape=(len(param_mapping),),
                          dtype=opt_grp['problem_parameters_type'].dtype)
    dset.resize((len(param_mapping),))
    a = np.zeros(len(param_mapping),
                 dtype=opt_grp['problem_parameters_type'].dtype)
    # NOTE(review): only len(problem_parameters) rows are filled; any extra
    # rows (keys coming from param_names) stay zero -- confirm intended.
    for idx, (parm, val) in enumerate(problem_parameters.items()):
        a[idx]["parameter"] = param_mapping[parm]
        a[idx]["value"] = val
    dset[:] = a

    # BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin bool is the documented replacement and is what np.bool
    # aliased all along.
    dt = np.dtype([("parameter", opt_grp['parameter_enum']),
                   ("is_integer", bool),
                   ("lower", np.float32),
                   ("upper", np.float32)])
    opt_grp['parameter_spec_type'] = dt
    is_integer = np.asarray(spec.is_integer_variable, dtype=bool)
    upper = np.asarray(spec.upper, dtype=np.float32)
    lower = np.asarray(spec.lower, dtype=np.float32)
    dset = h5_get_dataset(opt_grp, 'parameter_spec',
                          maxshape=(len(param_names),),
                          dtype=opt_grp['parameter_spec_type'].dtype)
    dset.resize((len(param_names),))
    a = np.zeros(len(param_names),
                 dtype=opt_grp['parameter_spec_type'].dtype)
    for idx, (parm, is_int, hi, lo) in enumerate(
            zip(param_names, is_integer, upper, lower)):
        a[idx]["parameter"] = param_mapping[parm]
        a[idx]["is_integer"] = is_int
        a[idx]["lower"] = lo
        a[idx]["upper"] = hi
    dset[:] = a

    dt = np.dtype({
        'names': param_names,
        'formats': [np.float32] * len(param_names)
    })
    opt_grp['parameter_space_type'] = dt
import os

import h5py
from pytz import timezone
from astral import LocationInfo

# Project layout: this file lives two directories below the repository root.
PROJECT_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../..")
RAW_DATA_PATH = os.path.join(PROJECT_PATH, "data/raw/davos")
DATASET_PATH = os.path.join(PROJECT_PATH, "data/datasets")
MODELS_PATH = os.path.join(PROJECT_PATH, "pretrained_models")
AVAILABLE_MODELS_FILE = os.path.join(PROJECT_PATH, "cloudseg/inference/models.yaml")

# strftime/strptime patterns at increasing resolution, plus a display format.
TIMESTAMP_FORMAT_DAY = "%Y%m%d"
TIMESTAMP_FORMAT_MINUTE = "%Y%m%d%H%M"
TIMESTAMP_FORMAT = "%Y%m%d%H%M%S"
PRETTY_FORMAT = "%d.%m.%Y %H:%M:%S"

# Observation site (Davos, CH) used for solar-position / daylight lookups.
LOCATION = LocationInfo("Davos", "Switzerland", "Europe/Zurich", 46.813492, 9.844433)
TIMEZONE = timezone("Europe/Zurich")

# HDF5 enumerated label type: CLOUD=0, SKY=1, MASK=-1 (signed int basetype).
LABEL_DATATYPE = h5py.enum_dtype({
    "CLOUD": 0,
    "SKY": 1,
    "MASK": -1
}, basetype="i")
.hdf5 file containing two datasets. """ from pathlib import Path import sys import h5py import numpy as np # Create enumerated type for labels labels = [ "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC" ] label_dict = {l: i for i, l in enumerate(labels)} label_enum = h5py.enum_dtype(label_dict, basetype='i') label_map = np.vectorize(label_dict.get) DATASETS = ["testa", "testb", "train"] print(f"Will convert {DATASETS} to hdf5") for ds in DATASETS: print(f"Processing {ds}") representation = f"representation.{ds}.npy" true_labels = f"true_labels.{ds}.npy" out = f"{ds}.hdf5" with h5py.File(out, "w") as f: f.create_dataset("representation", data=np.load(representation)) f.create_dataset("true_labels",
def water_level_trend_dtype(self) -> Type[WaterLevelTrend]:
    """Return the HDF5 enumerated dtype built from the WaterLevelTrend members.

    NOTE(review): the ``Type[WaterLevelTrend]`` annotation looks inaccurate
    (an h5py enum dtype is returned) but is kept to preserve the interface.
    """
    # dict([(k, v) for ...]) replaced with the equivalent dict comprehension.
    return h5py.enum_dtype(
        {trend.name: trend.value for trend in WaterLevelTrend})
def write_compound_datasets(f):
    """Write a suite of compound-type datasets (contiguous + gzip-chunked pairs).

    Covers: a mixed-field record (vlen utf8, fixed ASCII, enum, uint, float,
    array), a 2D complex-number-style record, records holding ragged vlen
    arrays, records holding arrays of vlen strings, and nested compound
    records.  Flushes and closes *f* when done.

    :param f: writable h5py File.
    """
    utf8 = h5py.special_dtype(vlen=str)
    gender_enum_dtype = h5py.enum_dtype({"MALE": 0, "FEMALE": 1}, basetype=np.uint8)
    dt = np.dtype([
        ('firstName', utf8),          # variable length utf8
        ('surname', 'S20'),           # fixed length ASCII
        ('gender', gender_enum_dtype),  # enum type
        ('age', np.uint8),            # uint
        ('fav_number', np.float32),   # float
        ('vector', np.float32, (3,))])  # array
    data = np.zeros(4, dtype=dt)
    # Set the example data
    data[0] = ('Bob', 'Smith', 0, 32, 1.0, [1, 2, 3])
    data[1] = ('Peter', 'Fletcher', 0, 43, 2.0, [16.2, 2.2, -32.4])
    data[2] = ('James', 'Mudd', 0, 12, 3.0, [-32.1,-774.1,-3.0])
    data[3] = ('Ellie', 'Kyle', 1, 22, 4.0, [2.1,74.1,-3.8])
    f.create_dataset('contiguous_compound', data=data)
    f.create_dataset('chunked_compound', data=data, chunks=(1,), compression="gzip")

    # 2d compound use img number example
    imgdt = np.dtype([
        ('real', np.float32),
        ('img', np.float32)
    ])
    data = np.zeros((3, 3), dtype=imgdt)
    data[0][0] = (2.3, -7.3)
    data[0][1] = (12.3, -17.3)
    data[0][2] = (-32.3, -0.3)
    data[1][0] = (2.3, -7.3)
    data[1][1] = (12.3, -17.3)
    data[1][2] = (-32.3, -0.3)
    data[2][0] = (2.3, -7.3)
    data[2][1] = (12.3, -17.3)
    data[2][2] = (-32.3, -0.3)
    f.create_dataset('2d_contiguous_compound', data=data)
    f.create_dataset('2d_chunked_compound', data=data, chunks=(1,2), compression="gzip")

    # Compound dataset containing ragged arrays
    uint8_vlen_type = h5py.vlen_dtype(np.uint8)
    compound_vlen_dtype = np.dtype([
        ('one', uint8_vlen_type),
        ('two', uint8_vlen_type)
    ])
    data = np.zeros(3, dtype=compound_vlen_dtype)
    data[0] = (np.array([1]), np.array([2]))
    data[1] = (np.array([1,1]), np.array([2,2]))
    data[2] = (np.array([1,1,1]), np.array([2,2,2]))
    f.create_dataset('vlen_contiguous_compound', data=data, dtype=compound_vlen_dtype)
    f.create_dataset('vlen_chunked_compound', data=data, dtype=compound_vlen_dtype, chunks=(1,), compression="gzip")

    # Compound dataset arrays of vlen type
    compound_vlen_dtype = np.dtype([
        ('name', utf8, 2)
    ])
    pointData = np.zeros(2, dtype=utf8)
    pointData[0] = "James"
    pointData[1] = "Ellie"
    data = np.zeros(1, dtype=compound_vlen_dtype)
    data['name'] = np.array(pointData)
    f.create_dataset('array_vlen_contiguous_compound', data=data, dtype=compound_vlen_dtype)
    f.create_dataset('array_vlen_chunked_compound', data=data, dtype=compound_vlen_dtype, chunks=(1,), compression="gzip")

    # Nested compound datasets use 2 img numbers as an example
    nested_dt = np.dtype([
        ('firstNumber', imgdt),
        ('secondNumber', imgdt),
    ])
    data = np.zeros(3, dtype=nested_dt)
    data[1] = ((1,1), (1,1))
    data[2] = ((2,2), (2,2))
    f.create_dataset('nested_contiguous_compound', data=data, dtype=nested_dt)
    f.create_dataset('nested_chunked_compound', data=data, dtype=nested_dt, chunks=(2,), compression="gzip")
    f.flush()
    f.close()
def main():
    """Create a file whose only content is a boolean-valued enum attribute."""
    bool_enum = h5py.enum_dtype({"False": 0, "True": 1}, basetype='i')
    with h5py.File(file_path, 'w') as out:
        out.attrs.create('boolean', 0, None, bool_enum)