def h5_cmpd():
    h = h5py.File('Brain_ONED_' + str(seed_no) + '.h5', 'a')
    maping_ex = {}
    maping_in = {}
    for ii in range(len(Me.spiketimes)):
        if len(Me[ii]) != 0:
            dset = h.create_dataset('/data/events/excitatory/spikes/' + str(ii), data=Me[ii])
            dset.attrs.create('source', data='nrn_' + str(ii))
            maping_ex['nrn_' + str(ii)] = '/data/events/excitatory/spikes/' + str(ii)
    for ii in range(len(Mi.spiketimes)):
        if len(Mi[ii]) != 0:
            dset = h.create_dataset('/data/events/inhibitory/spikes/' + str(ii), data=Mi[ii])
            dset.attrs.create('source', data='nrn_' + str(ii))
            maping_in['nrn_' + str(ii)] = '/data/events/inhibitory/spikes/' + str(ii)
    sp_type = np.dtype([('name', h5py.special_dtype(vlen=str)),
                        ('reference', h5py.special_dtype(vlen=str))])
    m_ex = h.create_dataset('/map/events/excitatory/spikes', dtype=sp_type, shape=(len(maping_ex),))
    m_in = h.create_dataset('/map/events/inhibitory/spikes', dtype=sp_type, shape=(len(maping_in),))
    doh_ = 0
    for ii, jj in maping_ex.items():
        m_ex[doh_] = (ii, jj)
        doh_ += 1
    doh_ = 0
    for ii, jj in maping_in.items():
        m_in[doh_] = (ii, jj)
        doh_ += 1
    h.close()
def _check_data(self, data):
    """Check that the data provided by the dataset is valid.

    It is valid when it can be stored in an HDF5 file using h5py.

    :param numpy.ndarray data: Data associated to the dataset
    :raises TypeError: In the case the data is not valid.
    """
    if isinstance(data, (six.text_type, six.binary_type)):
        return
    chartype = data.dtype.char
    if chartype == "U":
        pass
    elif chartype == "O":
        d = h5py.special_dtype(vlen=data.dtype)
        if d is not None:
            return
        d = h5py.special_dtype(ref=data.dtype)
        if d is not None:
            return
    else:
        return
    msg = "Type of the dataset '%s' is not supported. Found '%s'."
    raise TypeError(msg % (self.name, data.dtype))
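# NOTE (added sketch, not part of the snippet above): a minimal illustration of the
# special_dtype/check_dtype round trip that _check_data relies on when it probes for
# vlen and reference metadata. Only h5py is assumed.
import h5py

vlen_dt = h5py.special_dtype(vlen=str)            # object dtype tagged as variable-length str
ref_dt = h5py.special_dtype(ref=h5py.Reference)   # object dtype tagged as object reference

assert h5py.check_dtype(vlen=vlen_dt) is str             # recovers the vlen base type
assert h5py.check_dtype(ref=ref_dt) is h5py.Reference    # recovers the reference class
assert h5py.check_dtype(vlen=ref_dt) is None             # a ref dtype carries no vlen info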
def WriteSimpleTypes(self): if len(self.simpleTypes) == 0: return maxLenTypeName = self._getMaxLength([x.name for x in self.simpleTypes]) maxLenQuantity = self._getMaxLength([x.quantity for x in self.simpleTypes]) # maxLenUnit = self._getMaxLength([x.unit for x in self.simpleTypes]) numpyDataType = numpy.dtype({'names': ['name', 'dataType', 'quantity', 'relativeQuantity', 'description', 'unitOrEnumerationRow', ], 'formats': ['S' + str(max(maxLenTypeName, 1)), h5py.special_dtype(enum=(numpy.uint8, DataType)), # 'uint8', 'S' + str(max(maxLenQuantity, 1)), h5py.special_dtype(enum=(numpy.uint8, {'false':0, 'true':1})), # 'uint8', 'S1', 'int32']}) dataset = self.description.create_dataset('SimpleTypes', (len(self.simpleTypes), 1), dtype=numpyDataType, maxshape=(len(self.simpleTypes), 1), compression='gzip') allData = [] for simpleType in self.simpleTypes: allData.append((simpleType.name, simpleType.dataType, simpleType.quantity, simpleType.relativeQuantity, '', simpleType.unitOrEnumerationRow)) dataset[:, 0] = allData
def read_digital_compound(filename, fieldname, tmax, all_pops, h): cell_range = [0,1000,1050,1140,1230,1320,1560,2360,2560,3060,3160,3260,3360,3460,3560] pop_names = ['pyrRS23','pyrFRB23','bask23','axax23','LTS23', 'spinste14', 'tuftIB5', 'tuftRS5', 'nontuftRS6', 'bask56', 'axax56', 'LTS56', 'TCR', 'nRT'] arr = pd.read_csv(filename, sep='\t', names=['times','cells']) u_cells = arr.cells.unique() #get names of cells that fired pop_cell_dict = {} #dict of dict for cell_name in u_cells: pop_idx = bisect.bisect(cell_range, cell_name) - 1 if pop_idx in all_pops: #only those populations of interest pop_name = pop_names[pop_idx] try: pop_cell_dict[pop_name][cell_name] = arr[(arr.cells == cell_name) & (arr.times <= tmax)].times.values except KeyError: pop_cell_dict[pop_name] = {cell_name:arr[(arr.cells == cell_name) & (arr.times <= tmax)].times.values} sp_type = np.dtype([('unique_id', h5py.special_dtype(vlen=str)),('path', h5py.special_dtype(vlen=str))]) for pop_name,cell_dicts in pop_cell_dict.iteritems(): #flush them into hdf5 ii = '/data/events/'+pop_name+'/spike_'+fieldname+'/' for cell_name, cell_value in cell_dicts.iteritems(): e_dset = h.create_dataset(ii+str(cell_name), dtype=np.float32, data=cell_value) e_dset.attrs.create('SOURCE', data=str(cell_name)) e_mset = h.create_dataset('/map/events/'+pop_name+'/spike_'+fieldname, dtype=sp_type, shape=(len(cell_dicts),)) for idx,cell_name in enumerate(cell_dicts.iterkeys()): e_mset[idx] = (str(cell_name), ii+'_'+str(cell_name)) #attach_to_all_under(h, 'events/'+pop_name, e_mset) print 'Done', filename return h
def _set_values_to_brick(self, brick_guid, brick_slice, values, value_slice=None): brick_file_path = os.path.join(self.brick_path, '{0}.hdf5'.format(brick_guid)) log.trace('Brick slice to fill: %s', brick_slice) log.trace('Value slice to extract: %s', value_slice) # Create the HDF5 dataset that represents one brick bD = tuple(self.brick_domains[1]) cD = self.brick_domains[2] if value_slice is not None: vals = values[value_slice] else: vals = values if values.ndim == 0 and len(values.shape) == 0 and np.iterable(vals): # Prevent single value strings from being iterated vals = [vals] # Check for object type data_type = self.dtype fv = self.fill_value # Check for object type if data_type == '|O8': if np.iterable(vals): vals = [pack(x) for x in vals] else: vals = pack(vals) if self.inline_data_writes: if data_type == '|O8': data_type = h5py.special_dtype(vlen=str) if 0 in cD or 1 in cD: cD = True with HDFLockingFile(brick_file_path, 'a') as f: # TODO: Due to usage concerns, currently locking chunking to "auto" f.require_dataset(brick_guid, shape=bD, dtype=data_type, chunks=None, fillvalue=fv) f[brick_guid][brick_slice] = vals else: work_key = brick_guid work = (brick_slice, vals) work_metrics = (brick_file_path, bD, cD, data_type, fv) log.trace('Work key: %s', work_key) log.trace('Work metrics: %s', work_metrics) log.trace('Work[0]: %s', work[0]) # If the brick file doesn't exist, 'touch' it to make sure it's immediately available if not os.path.exists(brick_file_path): if data_type == '|O8': data_type = h5py.special_dtype(vlen=str) if 0 in cD or 1 in cD: cD = True with HDFLockingFile(brick_file_path, 'a') as f: # TODO: Due to usage concerns, currently locking chunking to "auto" f.require_dataset(brick_guid, shape=bD, dtype=data_type, chunks=None, fillvalue=fv) if self.auto_flush: # Immediately submit work to the dispatcher self.brick_dispatcher.put_work(work_key, work_metrics, work) else: # Queue the work for later flushing self._queue_work(work_key, work_metrics, work)
def PostHDF5 (p, post_data): """Post data using the hdf5 interface""" # Build the url and then create a hdf5 object url = 'http://{}/{}/{}/hdf5/{}/{},{}/{},{}/{},{}/'.format(SITE_HOST, p.token, ','.join(p.channels), p.resolution, *p.args) tmpfile = tempfile.NamedTemporaryFile () fh5out = h5py.File ( tmpfile.name ) for idx, channel_name in enumerate(p.channels): chan_grp = fh5out.create_group(channel_name) chan_grp.create_dataset("CUTOUT", tuple(post_data[idx,:].shape), post_data[idx,:].dtype, compression='gzip', data=post_data[idx,:]) chan_grp.create_dataset("CHANNELTYPE", (1,), dtype=h5py.special_dtype(vlen=str), data=p.channel_type) chan_grp.create_dataset("DATATYPE", (1,), dtype=h5py.special_dtype(vlen=str), data=p.datatype) fh5out.close() tmpfile.seek(0) try: # Build a post request req = urllib2.Request(url, tmpfile.read()) import time start = time.time() response = urllib2.urlopen(req) print time.time()-start tmpfile.close() return response except urllib2.HTTPError,e: return e
def __setitem__(self,key,value): if key in self: del self[key] if type(value) == tuple: maxshape = (None,) + value[1:] if type(self.__backing) == dict: self.__backing[key] = np.ndarray( value, dtype='<U15') elif type(self.__backing) == h5py.File: dtype = (h5py.special_dtype(vlen=unicode) if key.lower() in ['id','name'] else 'float32' ) self.__backing.create_dataset( key, shape=value, maxshape=maxshape, dtype=dtype) elif type(value) == np.ndarray: if type(self.__backing) == dict: self.__backing[key] = value elif type(self.__backing) == h5py.File: dtype = (h5py.special_dtype(vlen=unicode) if key.lower() in ['id','name'] else 'float32' ) maxshape = (None,) + value.shape[1:] self.__backing.create_dataset( key, shape=value.shape, maxshape=maxshape, dtype=dtype) self.__backing[key][:] = value else: raise TypeError
def _main(args): voc_path = os.path.expanduser(args.path_to_voc) train_ids = get_ids(voc_path, train_set) val_ids = get_ids(voc_path, val_set) test_ids = get_ids(voc_path, test_set) train_ids_2007 = get_ids(voc_path, sets_from_2007) total_train_ids = len(train_ids) + len(train_ids_2007) # Create HDF5 dataset structure print('Creating HDF5 dataset structure.') fname = os.path.join(voc_path, 'pascal_voc_07_12.hdf5') voc_h5file = h5py.File(fname, 'w') uint8_dt = h5py.special_dtype( vlen=np.dtype('uint8')) # variable length uint8 vlen_int_dt = h5py.special_dtype( vlen=np.dtype(int)) # variable length default int train_group = voc_h5file.create_group('train') val_group = voc_h5file.create_group('val') test_group = voc_h5file.create_group('test') # store class list for reference class ids as csv fixed-length numpy string voc_h5file.attrs['classes'] = np.string_(str.join(',', classes)) # store images as variable length uint8 arrays train_images = train_group.create_dataset( 'images', shape=(total_train_ids, ), dtype=uint8_dt) val_images = val_group.create_dataset( 'images', shape=(len(val_ids), ), dtype=uint8_dt) test_images = test_group.create_dataset( 'images', shape=(len(test_ids), ), dtype=uint8_dt) # store boxes as class_id, xmin, ymin, xmax, ymax train_boxes = train_group.create_dataset( 'boxes', shape=(total_train_ids, ), dtype=vlen_int_dt) val_boxes = val_group.create_dataset( 'boxes', shape=(len(val_ids), ), dtype=vlen_int_dt) test_boxes = test_group.create_dataset( 'boxes', shape=(len(test_ids), ), dtype=vlen_int_dt) # process all ids and add to datasets print('Processing Pascal VOC 2007 datasets for training set.') last_2007 = add_to_dataset(voc_path, '2007', train_ids_2007, train_images, train_boxes) print('Processing Pascal VOC 2012 training set.') add_to_dataset( voc_path, '2012', train_ids, train_images, train_boxes, start=last_2007 + 1) print('Processing Pascal VOC 2012 val set.') add_to_dataset(voc_path, '2012', val_ids, val_images, val_boxes) print('Processing Pascal VOC 2007 test set.') add_to_dataset(voc_path, '2007', test_ids, test_images, test_boxes) print('Closing HDF5 file.') voc_h5file.close() print('Done.')
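# NOTE (added sketch): reading back one entry from the variable-length datasets created
# above. The file and group names come from the snippet; decoding the uint8 payload with
# PIL is an assumption about how the bytes were encoded.
import io
import h5py
from PIL import Image

with h5py.File('pascal_voc_07_12.hdf5', 'r') as voc:
    raw = voc['train/images'][0]                  # 1-D uint8 array holding the encoded image
    boxes = voc['train/boxes'][0].reshape(-1, 5)  # rows of class_id, xmin, ymin, xmax, ymax
    img = Image.open(io.BytesIO(raw.tobytes()))   # only valid if the bytes are an encoded image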
def h5ProjInfo ( proj, h5f ):
    """Populate the HDF5 file with project attributes"""

    projgrp = h5f.create_group ( 'PROJECT' )
    projgrp.create_dataset("NAME", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.project_name)
    projgrp.create_dataset("HOST", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.host)
    projgrp.create_dataset("ND_VERSION", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.nd_version)
    projgrp.create_dataset("SCHEMA_VERSION", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.schema_version)
def h5ProjInfo ( proj, h5f ):
    """Populate the HDF5 file with project attributes"""

    projgrp = h5f.create_group ( 'PROJECT' )
    projgrp.create_dataset("NAME", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.getProjectName())
    projgrp.create_dataset("HOST", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.getDBHost())
    projgrp.create_dataset("OCP_VERSION", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.getOCPVersion())
    projgrp.create_dataset("SCHEMA_VERSION", (1,), dtype=h5py.special_dtype(vlen=str), data=proj.getSchemaVersion())
def testVlenReferenceDataItem(self):
    ref_dt = special_dtype(ref=Reference)
    dt = special_dtype(vlen=ref_dt)
    typeItem = hdf5dtype.getTypeItem(dt)
    typeSize = hdf5dtype.getItemSize(typeItem)
    self.assertEqual(typeItem['class'], 'H5T_VLEN')
    self.assertEqual(typeItem['size'], 'H5T_VARIABLE')
    baseItem = typeItem['base']
    self.assertEqual(baseItem['base'], 'H5T_STD_REF_OBJ')
    self.assertEqual(typeSize, 'H5T_VARIABLE')
def input_data(self, value):
    if 'input_data' in self.h5group.keys():
        self.h5group['input_data'] = value
    else:
        if isinstance(value, h5py.h5r.Reference):
            self.h5group.create_dataset('input_data', data=value,
                                        dtype=h5py.special_dtype(ref=h5py.Reference))
        else:
            print value
            dset = self.h5group.create_dataset('input_data', (len(value),),
                                               dtype=h5py.special_dtype(ref=h5py.Reference))
            for i, v in enumerate(value):
                dset[i] = v
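# NOTE (added sketch): the object-reference pattern the setter above uses, reduced to a
# self-contained example. File and dataset names here are illustrative, not taken from
# the snippet.
import numpy as np
import h5py

with h5py.File('refs_demo.h5', 'w') as f:
    a = f.create_dataset('a', data=np.arange(3))
    b = f.create_dataset('b', data=np.arange(5))
    ref_dt = h5py.special_dtype(ref=h5py.Reference)
    refs = f.create_dataset('input_data', (2,), dtype=ref_dt)
    refs[0], refs[1] = a.ref, b.ref
    # references are resolved back to objects through the containing file
    assert f[refs[0]].shape == (3,)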
def h5ProjInfo ( proj, h5f ):
    """Populate the HDF5 file with project attributes"""

    projgrp = h5f.create_group ( 'PROJECT' )
    projgrp.create_dataset ( "NAME", (1,), dtype=h5py.special_dtype(vlen=str), data=proj._dbname )
    projgrp.create_dataset ( "HOST", (1,), dtype=h5py.special_dtype(vlen=str), data=proj._dbhost )
    projgrp.create_dataset ( "TYPE", (1,), dtype=np.uint32, data=proj._dbtype )
    projgrp.create_dataset ( "DATASET", (1,), dtype=h5py.special_dtype(vlen=str), data=proj._dataset )
    projgrp.create_dataset ( "DATAURL", (1,), dtype=h5py.special_dtype(vlen=str), data=proj._dataurl )
    projgrp.create_dataset ( "READONLY", (1,), dtype=bool, data=(False if proj._readonly == 0 else True))
    projgrp.create_dataset ( "EXCEPTIONS", (1,), dtype=bool, data=(False if proj._exceptions == 0 else True))
    projgrp.create_dataset ( "RESOLUTION", (1,), dtype=np.uint8, data=proj._resolution)
def walk(dd, df): for key, value in dd.iteritems(): if isinstance(value, dict): try: dset = df[key] except: dset = df.require_group(key) walk(value, dset) else: if (type(value) is np.float) or \ (type(value) is np.int): try: dset = df[key] except KeyError: dset = df.require_dataset( key, (0, 1), type(value), maxshape=(None, 1), compression='lzf') dset.resize(dset.shape[0]+1, axis=0) dset[-1, 0] = value if (type(value) is np.str): try: dset = df[key] except KeyError: dt = h5py.special_dtype(vlen=unicode) dset = df.require_dataset( key, (0, 1), dt, maxshape=(None, 1), compression='lzf') dset.resize(dset.shape[0]+1, axis=0) dset[-1, 0] = value if type(value) is np.ndarray: if type(value[0]) is np.string_: dt = h5py.special_dtype(vlen=unicode) else: dt = np.float try: dset = df[key] except KeyError: dset = df.require_dataset( key, (0,)+value.shape, dt, maxshape=(None,)+value.shape, compression='lzf') dset.resize(dset.shape[0]+1, axis=0) dset[-1, ...] = value
def copyAttributes(inDs, outDs):
    for k in inDs.attrs.keys():
        logging.debug("copying attribute: %s" % k)
        elt = inDs.attrs[k]
        if isinstance(elt, basestring):
            # h5py wants to simplify things down, so I think that this
            # is a possibility.
            newDtype = H5.special_dtype(vlen=str)
        elif elt.dtype == 'object':
            # this has to do with a numpy problem.
            newDtype = H5.special_dtype(vlen=str)
        else:
            newDtype = elt.dtype
        outDs.attrs.create(k, inDs.attrs[k], dtype=newDtype)
def WriteEnumerations(self): if len(self.enumerations) == 0: return numpyDataType = numpy.dtype({'names': ['name', 'value', 'description', 'firstEntry'], 'formats': [h5py.special_dtype(vlen=unicode),#'S' + str(max(maxLenName, 1)), 'int32', h5py.special_dtype(vlen=unicode),#'S' + str(max(maxLenDescription, 1)), h5py.special_dtype(enum=(numpy.uint8, {'false':0, 'true':1}))]}) # 'uint8']}) dataset = self.description.create_dataset('Enumerations', (len(self.enumerations), 1), dtype=numpyDataType, maxshape=(len(self.enumerations), 1), compression='gzip') allData = [] for enum in self.enumerations: allData.append((enum.name, enum.value, enum.description, enum.firstEntry)) dataset[:, 0] = allData
def parse_structure(key, group, value, _type, **kwds):
    try:
        # Here we check if there are any signals in the container, as
        # casting a long list of signals to a numpy array takes a very long
        # time. So we check if there are any, and save numpy the trouble
        if np.any([isinstance(t, BaseSignal) for t in value]):
            tmp = np.array([[0]])
        else:
            tmp = np.array(value)
    except ValueError:
        tmp = np.array([[0]])
    if tmp.dtype == np.dtype('O') or tmp.ndim != 1:
        dict2hdfgroup(dict(zip(
            [str(i) for i in range(len(value))], value)),
            group.create_group(_type + str(len(value)) + '_' + key),
            **kwds)
    elif tmp.dtype.type is np.unicode_:
        group.create_dataset(_type + key,
                             tmp.shape,
                             dtype=h5py.special_dtype(vlen=str),
                             **kwds)
        group[_type + key][:] = tmp[:]
    else:
        group.create_dataset(_type + key, data=tmp, **kwds)
def _save_hdf5_v2(self, filename, group = "Twiss"): # data type dt = np.dtype( [ ('element', h5py.special_dtype(vlen=bytes)), ('s', np.float64), ('alphax', np.float64), ('alphay', np.float64), ('betax', np.float64), ('betay', np.float64), ('etax', np.float64), ('etaxp', np.float64), ('etay', np.float64), ('etayp', np.float64), ('phix', np.float64), ('phiy', np.float64), ] ) data = np.ndarray((len(self.element),), dtype=dt) data['element'] = self.element for i,k in enumerate(self._cols): data[k] = [v[i] for v in self._twtable] f = h5py.File(filename) grp = f.create_group(group) grp['twtable'] = data grp['tune'] = np.array(self.tune) grp['chrom'] = np.array(self.chrom) grp['alphac'] = self.alphac f.close()
def test_vlen_bytes(self):
    """ Vlen bytes dataset maps to vlen ascii in the file """
    dt = h5py.special_dtype(vlen=bytes)
    ds = self.f.create_dataset('x', (100,), dtype=dt)
    tid = ds.id.get_type()
    self.assertEqual(type(tid), h5py.h5t.TypeStringID)
    self.assertEqual(tid.get_cset(), h5py.h5t.CSET_ASCII)
def test_create(self):
    """ Enum datasets can be created and type correctly round-trips """
    dt = h5py.special_dtype(enum=('i', self.EDICT))
    ds = self.f.create_dataset('x', (100, 100), dtype=dt)
    dt2 = ds.dtype
    dict2 = h5py.check_dtype(enum=dt2)
    self.assertEqual(dict2, self.EDICT)
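# NOTE (added sketch): a self-contained version of the enum round trip tested above;
# the colour mapping is illustrative and stands in for self.EDICT.
import h5py

colors = {'RED': 0, 'GREEN': 1, 'BLUE': 2}
enum_dt = h5py.special_dtype(enum=('i', colors))   # int-backed HDF5 enum
with h5py.File('enum_demo.h5', 'w') as f:
    ds = f.create_dataset('pixel_class', (4,), dtype=enum_dt)
    ds[...] = [colors['RED'], colors['BLUE'], colors['GREEN'], colors['RED']]
    assert h5py.check_dtype(enum=ds.dtype) == colors  # the mapping survives the round trip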
def test_vlen_unicode(self):
    """ Vlen unicode dataset maps to vlen utf-8 in the file """
    dt = h5py.special_dtype(vlen=unicode)
    ds = self.f.create_dataset('x', (100,), dtype=dt)
    tid = ds.id.get_type()
    self.assertEqual(type(tid), h5py.h5t.TypeStringID)
    self.assertEqual(tid.get_cset(), h5py.h5t.CSET_UTF8)
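# NOTE (added aside): in h5py >= 2.10 the two dtypes exercised by the tests above can
# also be spelled with string_dtype(); vlen=bytes corresponds to ASCII storage and
# vlen=str/unicode to UTF-8.
import h5py

ascii_dt = h5py.string_dtype(encoding='ascii')   # same on-disk type as special_dtype(vlen=bytes)
utf8_dt = h5py.string_dtype(encoding='utf-8')    # same on-disk type as special_dtype(vlen=str)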
def __init__(self, output_name, output_dir, num_files, patches, feature_type,
             patch_dim=128, patch_type='uint8', pos_type='uint16'):
    self.log = get_logger()
    output_subdir = output_dir
    try:
        makedirs(output_subdir)
    except:
        pass
    output_filename = join(output_subdir, basename(output_name))
    self.log.debug('Saving extracted descriptors to %s', output_filename)
    self.mode = 'creating'
    dt = special_dtype(vlen=bytes)
    patches += 10  # for safety
    self.hfile = HDF5File(output_filename, 'w', compression='gzip', fillvalue=0.0)
    self.patches = self.hfile.create_dataset('patches', (num_files * patches, patch_dim),
                                             dtype=patch_type, chunks=True)
    self.positions = self.hfile.create_dataset('positions', (num_files * patches, 2),
                                               dtype=pos_type, chunks=True)
    self.image_index = self.hfile.create_dataset('image_index', (num_files, 2),
                                                 dtype='uint64')  # Start, End positions of an image
    self.keys = self.hfile.create_dataset('keys', (num_files, ), dtype=dt)
    self.key_set = set()
    self.patches.attrs['cursor'] = 0
    self.patches.attrs['feature_type'] = feature_type
    self.output_filename = output_filename
def make_vlen_dataset(source):
    # Create a variable-length 1D dataset
    dtype = h5py.special_dtype(vlen=numpy.dtype(source_dtypes[source]))
    dataset = h5file.create_dataset(
        source, (num_examples,), dtype=dtype)
    # Create a dataset to store variable-length shapes.
    axis_labels = source_axis_labels[source]
    dataset_shapes = h5file.create_dataset(
        '{}_shapes'.format(source), (num_examples, len(axis_labels)),
        dtype='uint16')
    # Create a dataset to store labels for variable-length axes.
    dataset_vlen_axis_labels = h5file.create_dataset(
        '{}_vlen_axis_labels'.format(source), (len(axis_labels),),
        dtype='S{}'.format(
            numpy.max([len(label) for label in axis_labels])))
    # Fill variable-length axis labels
    dataset_vlen_axis_labels[...] = [
        label.encode('utf8') for label in axis_labels]
    # Attach auxiliary datasets as dimension scales of the
    # variable-length 1D dataset. This is in accordance with the
    # H5PYDataset interface.
    dataset.dims.create_scale(dataset_shapes, 'shapes')
    dataset.dims[0].attach_scale(dataset_shapes)
    dataset.dims.create_scale(dataset_vlen_axis_labels, 'shape_labels')
    dataset.dims[0].attach_scale(dataset_vlen_axis_labels)
    # Tag fixed-length axis with its label
    dataset.dims[0].label = 'batch'
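# NOTE (added sketch): the same variable-length layout in current h5py, where
# vlen_dtype() replaces special_dtype(vlen=...) and Dataset.make_scale() replaces the
# deprecated dims.create_scale(). File and dataset names are illustrative.
import numpy
import h5py

with h5py.File('vlen_demo.h5', 'w') as f:
    dt = h5py.vlen_dtype(numpy.dtype('float32'))
    traces = f.create_dataset('traces', (3,), dtype=dt)
    traces[0] = numpy.arange(5, dtype='float32')   # rows may have different lengths
    traces[1] = numpy.arange(2, dtype='float32')
    shapes = f.create_dataset('traces_shapes', data=[[5], [2], [0]], dtype='uint16')
    shapes.make_scale('shapes')                    # modern spelling of dims.create_scale
    traces.dims[0].attach_scale(shapes)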
def dump_unicode(obj, h5f, compression=None):
    """ dumps a list object to h5py file"""
    dt = h5.special_dtype(vlen=unicode)
    ll = len(obj)
    dset = h5f.create_dataset('data', shape=(ll, ), compression=compression, dtype=dt)
    dset[:ll] = obj
    h5f.create_dataset('type', data=['unicode'])
def _createDatasetInFile(self, hdf5File, datasetName, roi): shape = tuple(roi[1] - roi[0]) chunks = self._description.chunks if chunks is not None: # chunks must not be bigger than the data in any dim chunks = numpy.minimum(chunks, shape) chunks = tuple(chunks) compression = self._description.compression compression_opts = self._description.compression_opts dtype = self._description.dtype if dtype == object: dtype = h5py.special_dtype(vlen=numpy.uint8) dataset = hdf5File.create_dataset( datasetName, shape=shape, dtype=dtype, chunks=chunks, compression=compression, compression_opts=compression_opts, ) # Set data attributes if self._description.drange is not None: dataset.attrs["drange"] = self._description.drange if _use_vigra: dataset.attrs["axistags"] = vigra.defaultAxistags(str(self._description.axes)).toJSON()
def get_attribute_types(fname):
    if not h5py.is_hdf5(fname):
        return ""
    types = set()
    dt = h5py.special_dtype(vlen=str)
    try:
        h5 = h5py.File(fname, 'r')
        have_type = '/data_descr/types' in h5
        all_types = set(h5['/data_descr/types'])
        for o in h5['/data_descr/ordering']:
            indptr_name = '/data/' + o + '_indptr'
            indices_name = '/data/' + o + '_indices'
            if indptr_name in h5 and indices_name in h5:
                types.add('Sparse Matrix')
            else:
                if have_type and o in all_types:
                    types.add(h5['/data_descr/types'][o])
                else:
                    t = h5['/data/' + o].dtype
                    if t == dt:
                        types.add("String")
                    elif t in (numpy.int64, numpy.int32):
                        types.add("Integer")
                    elif t in (numpy.float64, numpy.float32):
                        types.add("Floating Point")
                    else:
                        types.add(str(t))
        h5.close()
    except:
        pass
    return ','.join(list(types))
def test_create_array_string(self): file_name = get_temp_file() with h5py.File(file_name) as loc: shape = (32,) a = np.zeros(shape, dtype=np.dtype('|S3')) path = '/string/32' msg = create_array(loc, path, a) self.assertEqual(msg, path) shape = (32,1) a = np.zeros(shape, dtype=np.dtype('|U2')) path = '/string/32 x 1' msg = create_array(loc, path, a) self.assertEqual(msg, path) shape = (1, 32) a = np.zeros(shape, dtype=h5py.special_dtype(vlen=str)) path = '/string/1 x 32' msg = create_array(loc, path, a) self.assertEqual(msg, path) shape = (8, 16) a = np.zeros(shape, dtype=np.dtype('|S7')) path = '/float/8 x 16' msg = create_array(loc, path, a) self.assertEqual(msg, path) shape = (8, 16, 4) a = np.zeros(shape, dtype=np.dtype('|U8')) path = '/float/8 x 16 x 4' msg = create_array(loc, path, a) self.assertEqual(msg, path)
def _write_arrays(group, name, data, parent=None):
    grefs = group.create_group('_refs_{}'.format(name))
    ref_dtype = _h5.special_dtype(ref=_h5.Reference)
    dname = group.create_dataset(name, (_np.size(data),), dtype=ref_dtype)

    # ======================================
    # Create datasets
    # ======================================
    for i, array in enumerate(data):
        if array.dtype == _np.dtype(object):
            # ======================================
            # If dataset can't be created, nest
            # ======================================
            darray = _write_arrays(grefs, '{}'.format(i), array, parent=name)
        else:
            darray = grefs.create_dataset(name='{}'.format(i), data=array,
                                          shape=_np.shape(array), compression="gzip")

        # ======================================
        # Store reference in dataset
        # ======================================
        dname[i] = darray.ref

    # if parent == 'hist':
    #     pdb.set_trace()

    # ======================================
    # Return created dataset
    # ======================================
    return dname
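# NOTE (added sketch): a one-level read-back companion for _write_arrays above; deeper
# nesting would need recursion. Group/dataset names follow the writer, the helper name
# is an assumption.
import h5py

def _read_arrays(group, name):
    """Dereference the refs stored by _write_arrays and return the target arrays."""
    return [group.file[ref][()] for ref in group[name]]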
def dump_unicode(obj, h5f, **kwargs):
    """ dumps a list object to h5py file"""
    dt = h5.special_dtype(vlen=unicode)
    ll = len(obj)
    dset = h5f.create_dataset('data', shape=(ll, ), dtype=dt, **kwargs)
    dset[:ll] = obj
    h5f.create_dataset('type', data=['unicode'])
def get_state(self, state): """Saves the vocabulary in a network state file. If there already is a vocabulary in the state, it will be replaced, so it has to have the same number of words. :type state: h5py.File :param state: HDF5 file for storing the neural network parameters """ h5_vocabulary = state.require_group('vocabulary') if 'words' in h5_vocabulary: state['words'][:] = self.id_to_word else: str_dtype = h5py.special_dtype(vlen=str) h5_vocabulary.create_dataset('words', data=self.id_to_word, dtype=str_dtype) if 'classes' in h5_vocabulary: state['classes'][:] = self.word_id_to_class_id else: h5_vocabulary.create_dataset('classes', data=self.word_id_to_class_id) probs = [self._word_classes[class_id].get_prob(word_id) for word_id, class_id in enumerate(self.word_id_to_class_id)] if 'probs' in h5_vocabulary: state['probs'][:] = probs else: h5_vocabulary.create_dataset('probs', data=probs)
def CifarAnalysis(folderName=None, batchsize=1000, **kwd): id_gpu = 0 OutStr = "" OutStr += 'GPU: {}\n'.format(id_gpu) OutStr += 'Minibatch-size: {}\n'.format(batchsize) OutStr += 'kwd: {}\n'.format(kwd) OutStr += '' print OutStr fOutput = None if folderName: if not os.path.exists(folderName): os.makedirs(folderName) fOutput = open(os.path.join(folderName, "output.dat"), "w") shutil.copyfile(__file__, os.path.join(folderName, os.path.basename(__file__))) # Prepare dataset data_tr = np.zeros((50000, 3 * 32 * 32), dtype=np.float32) data_ev = np.zeros((10000, 3 * 32 * 32), dtype=np.float32) label_tr = np.zeros((50000), dtype=np.int32) label_ev = np.zeros((10000), dtype=np.int32) I_colors = 3 I_Xunit = 32 I_Yunit = 32 F_unit = 100 # be careful!! h5f_tr = h5py.File("data_cifar100/train.h5f", "r") data_tr[:] = h5f_tr["ZCA_byTrainData/data"].value label_tr[:] = h5f_tr["Info/fine_labels"].value h5f_ev = h5py.File("data_cifar100/test.h5f", "r") data_ev[:] = h5f_ev["ZCA_byTrainData/data"].value label_ev[:] = h5f_ev["Info/fine_labels"].value ## Prep x_tr = data_tr.reshape((len(data_tr), 3, 32, 32)) x_ev = data_ev.reshape((len(data_ev), 3, 32, 32)) y_tr = label_tr y_ev = label_ev N_tr = len(data_tr) # 50000 N_ev = len(data_ev) # 10000 ag = Augument.Augumentation() ## Define analisis Resume = None if "Resume" in kwd: Resume = kwd["Resume"] del kwd["Resume"] model, ModelKwd = net.GenModel(I_colors=I_colors, I_Xunit=I_Xunit, I_Yunit=I_Yunit, F_unit=F_unit, **kwd) if id_gpu >= 0: cuda.get_device(id_gpu).use() model.to_gpu() xp = np if id_gpu < 0 else cuda.cupy # Setup optimizer optimizer = optimizers.Adam() optimizer.setup(model) # Init/Resume if Resume: print 'Load optimizer state from %s' % (Resume) with h5py.File(Resume, "r") as f: s = HDF5Deserializer(f) s_model = s["model"] s_model.load(model) # Setup stop manager sm = StopManager.StopManager() sm.SetMaximumEpoch(10000) sm.SetMinimumEpoch(10) sm.SetStopThreshold(3e-4) print sm # Learning loop if fOutput: fOutput.write("epoch,mode,loss,accuracy\n") #for epoch in six.moves.range(1, n_epoch + 1): epoch = 0 while True: epoch += 1 print 'epoch %d' % epoch # training perm = np.random.permutation(N_tr) sum_accuracy = 0 sum_loss = 0 start = time.time() for i in six.moves.range(0, N_tr, batchsize): bx = x_tr[perm[i:i + batchsize]] #if epoch>10: bx = ag.Aug(bx) #print bx[0] #bx = ag.Aug(bx) #print bx[0] #raw_input() x = chainer.Variable(xp.asarray(bx)) t = chainer.Variable(xp.asarray(y_tr[perm[i:i + batchsize]])) # Pass the loss function (Classifier defines it) and its arguments model.predictor.setTrainMode(True) optimizer.update(model, x, t) if (epoch == 1 and i == 0) and folderName: with open(os.path.join(folderName, 'graph.dot'), 'w') as o: g = computational_graph.build_computational_graph( (model.loss, )) o.write(g.dump()) print 'graph generated' sum_loss += float(model.loss.data) * len(t.data) sum_accuracy += float(model.accuracy.data) * len(t.data) end = time.time() elapsed_time = end - start throughput = N_tr / elapsed_time print 'train mean loss=%.5f, accuracy=%.2f%%, throughput=%.0f images/sec' % ( sum_loss / N_tr, sum_accuracy / N_tr * 100., throughput) if fOutput: fOutput.write("%d,Train,%e,%e\n" % (epoch, sum_loss / N_tr, sum_accuracy / N_tr)) # evaluation perm = np.random.permutation(N_ev) sum_accuracy = 0 sum_loss = 0 for i in six.moves.range(0, N_ev, batchsize): x = chainer.Variable(xp.asarray(x_ev[perm[i:i + batchsize]]), volatile='on') t = chainer.Variable(xp.asarray(y_ev[perm[i:i + batchsize]]), volatile='on') 
model.predictor.setTrainMode(False) loss = model(x, t) sum_loss += float(loss.data) * len(t.data) sum_accuracy += float(model.accuracy.data) * len(t.data) print 'test mean loss=%.5f, accuracy=%.2f%%' % ( sum_loss / N_ev, sum_accuracy / N_ev * 100, ) sm.AddAccuracy(sum_accuracy / N_ev) print sm.GetInfo() if fOutput: fOutput.write("%d,Test,%e,%e\n" % (epoch, sum_loss / N_ev, sum_accuracy / N_ev)) StopFlag = sm.StopCheck() if folderName and (epoch % 1 == 0 or StopFlag): # Save the model and the optimizer if StopFlag: myFname = os.path.join(folderName, 'mlp_final') else: myFname = os.path.join(folderName, 'mlp_%d' % epoch) with h5py.File(myFname + ".hdf5", "w") as f: s = HDF5Serializer(f) s["model"].save(model) f.create_dataset("kwd", data=ModelKwd.__str__(), dtype=h5py.special_dtype(vlen=unicode)) f.create_dataset("net", data=netFile, dtype=h5py.special_dtype(vlen=unicode)) f.flush() if StopFlag: break if fOutput: fOutput.close()
def processNMostCommon(N=3, wavdirpath=PATH_TRAIN_IN_16KWAVS, xmlpicklepath=PATH_TEST_OUT_XMLPICKLEFILE, todirrootpath=PATH_TEST_OUT_HDF5): global spectrogramWindowLength if not os.path.exists(todirrootpath): os.makedirs(todirrootpath) spectrogramHeight = 200 f = h5py.File( os.path.join(todirrootpath, "data_top{}_nozero.hdf5".format(N)), "w") dsetX = f.create_dataset( 'X', (0, 1, spectrogramHeight, spectrogramWindowLength), maxshape=(None, 1, spectrogramHeight, spectrogramWindowLength)) dsety = f.create_dataset('y', (0, N), maxshape=(None, N)) dsetMediaId = f.create_dataset('MediaId', (0, 1), maxshape=(None, 1)) dsetClassId = f.create_dataset('ClassId', (0, 1), maxshape=(None, 1), dtype=h5py.special_dtype(vlen=unicode)) import pickle df = pd.read_pickle(xmlpicklepath) # read the metadata # if we would like to keep recordings with a given quality than we can do it here by uncommenting the next line #df = filterByQuality(df, 0, 3) df["OFGS"] = df.apply(mergeOFGS, axis=1) # merge Order, Family, Genus, Species df_mc = getMostCommon(df, N) # get N most common classes from the dataset df = None # let GC free up some memory print("Metadata loaded") # Shuffle rows df_mc = df_mc.iloc[np.random.permutation(len(df_mc))] df_mc.reset_index(drop=True, inplace=True) (lb, binaryLabels) = getOneHotClassId(df_mc) # generate one-hot labels pickle.dump( lb, open( os.path.join(todirrootpath, "labelBinarizer_top{}.pickle".format(N)), 'wb')) # process the selected files of top N classes and save the data into HDF5 fileRanges = np.hstack((np.arange(0, len(df_mc), 30), len(df_mc))) for i in range(len(fileRanges) - 1): tempSG = wavsToSpectrogramByList( wavdirpath, df_mc.FileName[fileRanges[i]:fileRanges[i + 1]], dontFilter=False) X, y, fn, cIds = spectrogramListToT4(tempSG, \ binaryLabels[fileRanges[i]: fileRanges[i+1]], \ filenames = df_mc.MediaId[fileRanges[i]: fileRanges[i+1]].values, N=spectrogramWindowLength, \ classIds = df_mc.ClassId[fileRanges[i]: fileRanges[i+1]].values) #convert to t4 pre_len = dsetX.shape[0] add_len = X.shape[0] dsetX.resize(pre_len + add_len, axis=0) dsety.resize(pre_len + add_len, axis=0) dsetMediaId.resize(pre_len + add_len, axis=0) dsetClassId.resize(pre_len + add_len, axis=0) dsetX[pre_len:pre_len + add_len, :, :, :] = X dsety[pre_len:pre_len + add_len, :] = y dsetMediaId[pre_len:pre_len + add_len, :] = np.transpose( [[int(i) for i in fn]]) dsetClassId[pre_len:pre_len + add_len, :] = np.transpose( [[s.encode('utf8') for s in cIds]]) f.flush() f.close return (X, y, fn) # return last batch for debug purposes
# print(token_ids, len(token_ids))
tokens_tensor = torch.tensor([token_ids])
token_type_tensor = torch.LongTensor([[0] * len(tokens_a_delim)])
# print(token_type_tensor)
_, _, attn_data_list = model(tokens_tensor, token_type_ids=token_type_tensor)
attn_tensor = torch.stack(
    [attn_data['attn_probs'] for attn_data in attn_data_list])
attention[sent] = attn_tensor.data.numpy()

L = len(sentences)
sent_id = []
attentions = []
for idx in attention:
    sent_id.append(idx)
    attentions.append(attention[idx])

f = h5py.File('attn.h5', 'w')
dt = h5py.special_dtype(vlen=np.dtype('float64'))
dataset = f.create_dataset('vlen', (L, 12, 1, 12), dtype=dt)
for i in range(len(attentions)):
    dataset[i] = attentions[i]
f.close()
def create_target(df, prefix): print(f'There are {len(df)} bounding boxes matched in {prefix}...') # setting path to data datapath = os.path.join(modelpath, f"{prefix}_data_300_vgg.h5") # sorting all df df = df.sort_values('ImageID') # open file and maintain it opened f = h5py.File(datapath, 'w') try: # get the first image img = df.iloc[0] # setting initial states to iterate over all dataframe img_name = img[0] img_path = img[7] images = [] target = [img[13:].tolist() + img[9:13].tolist()] # iterate over all data set for i, img in tqdm(enumerate(df.iloc[:, :].itertuples())): # in first iteration of each batch size create the group # save last image when the new one is new if img_name != img[1]: images.append([ img_name.encode("ascii", "ignore"), img_path.encode("ascii", "ignore") ]) # create a dataset with the position and classification f.create_dataset(name=img_name, data=target, dtype=np.float32, compression='gzip', compression_opts=4) # clean all states target = [] img_name = img[1] img_path = img[8] target.append(list(img[14:] + img[10:14])) f.create_dataset(name=img_name, data=target[0], dtype=np.float32, compression='gzip', compression_opts=4) f.create_dataset(name='images', shape=(len(images), 2), data=images, dtype=h5py.special_dtype(vlen=str), compression='gzip', compression_opts=4) finally: f.close()
def savetoqmcpack(cell, mf, title="Default", kpts=[]): import h5py, re from collections import defaultdict from pyscf.pbc import gto, scf, df, dft PBC = False UnRestricted = False Complex = False val = str(mf) ComputeMode = re.split('[. ]', val) SizeMode = len(ComputeMode) for i in range(SizeMode): if ComputeMode[i] in ("UHF", "KUHF", "UKS"): UnRestricted = True if ComputeMode[i] == "pbc": PBC = True if PBC and len(kpts) == 0: sys.exit( "You need to specify explicit the list of K-point (including gamma)" ) IonName = dict([('H', 1), ('He', 2), ('Li', 3), ('Be', 4), ('B', 5), ('C', 6), ('N', 7), ('O', 8), ('F', 9), ('Ne', 10), ('Na', 11), ('Mg', 12), ('Al', 13), ('Si', 14), ('P', 15), ('S', 16), ('Cl', 17), ('Ar', 18), ('K', 19), ('Ca', 20), ('Sc', 21), ('Ti', 22), ('V', 23), ('Cr', 24), ('Mn', 25), ('Fe', 26), ('Co', 27), ('Ni', 28), ('Cu', 29), ('Zn', 30), ('Ga', 31), ('Ge', 32), ('As', 33), ('Se', 34), ('Br', 35), ('Kr', 36), ('Rb', 37), ('Sr', 38), ('Y', 39), ('Zr', 40), ('Nb', 41), ('Mo', 42), ('Tc', 43), ('Ru', 44), ('Rh', 45), ('Pd', 46), ('Ag', 47), ('Cd', 48), ('In', 49), ('Sn', 50), ('Sb', 51), ('Te', 52), ('I', 53), ('Xe', 54), ('Cs', 55), ('Ba', 56), ('La', 57), ('Ce', 58), ('Pr', 59), ('Nd', 60), ('Pm', 61), ('Sm', 62), ('Eu', 63), ('Gd', 64), ('Tb', 65), ('Dy', 66), ('Ho', 67), ('Er', 68), ('Tm', 69), ('Yb', 70), ('Lu', 71), ('Hf', 72), ('Ta', 73), ('W', 74), ('Re', 75), ('Os', 76), ('Ir', 77), ('Pt', 78), ('Au', 79), ('Hg', 80), ('Tl', 81), ('Pb', 82), ('Bi', 83), ('Po', 84), ('At', 85), ('Rn', 86), ('Fr', 87), ('Ra', 88), ('Ac', 89), ('Th', 90), ('Pa', 91), ('U', 92), ('Np', 93)]) H5_qmcpack = h5py.File(title + '.h5', 'w') groupApp = H5_qmcpack.create_group("application") CodeData = groupApp.create_dataset("code", (1, ), dtype="S5") CodeData[0:] = "PySCF" CodeVer = groupApp.create_dataset("version", (3, ), dtype="i4") CodeVer[0:] = 1 CodeVer[1:] = 4 CodeVer[2:] = 2 natom = cell.natm dt = h5py.special_dtype(vlen=bytes) #Group Atoms groupAtom = H5_qmcpack.create_group("atoms") #Dataset Number Of Atoms groupAtom.create_dataset("number_of_atoms", (1, ), dtype="i4", data=natom) #Dataset Number Of Species #Species contains (Atom_Name, Atom_Number,Atom_Charge,Atom_Core) l_atoms = [(cell.atom_symbol(x), IonName[cell.atom_symbol(x)], cell.atom_charge(x), cell.atom_nelec_core(x)) for x in range(natom)] d = defaultdict(list) for i, t in enumerate(l_atoms): d[t].append(i) idxSpeciestoAtoms = dict() uniq_atoms = dict() for i, (k, v) in enumerate(d.items()): idxSpeciestoAtoms[i] = v uniq_atoms[i] = k idxAtomstoSpecies = dict() for k, l_v in idxSpeciestoAtoms.items(): for v in l_v: idxAtomstoSpecies[v] = k NbSpecies = len(idxSpeciestoAtoms.keys()) groupAtom.create_dataset("number_of_species", (1, ), dtype="i4", data=NbSpecies) #Dataset positions MyPos = groupAtom.create_dataset("positions", (natom, 3), dtype="f8") for x in range(natom): MyPos[x:] = cell.atom_coord(x) #Group Atoms for x in range(NbSpecies): atmname = str(uniq_atoms[x][0]) groupSpecies = groupAtom.create_group("species_" + str(x)) groupSpecies.create_dataset("atomic_number", (1, ), dtype="i4", data=uniq_atoms[x][1]) mylen = "S" + str(len(atmname)) AtmName = groupSpecies.create_dataset("name", (1, ), dtype=mylen) AtmName[0:] = atmname groupSpecies.create_dataset("charge", (1, ), dtype="f8", data=uniq_atoms[x][2]) groupSpecies.create_dataset("core", (1, ), dtype="f8", data=uniq_atoms[x][3]) SpeciesID = groupAtom.create_dataset("species_ids", (natom, ), dtype="i4") for x in range(natom): SpeciesID[x:] = idxAtomstoSpecies[x] 
#Parameter Group GroupParameter = H5_qmcpack.create_group("parameters") GroupParameter.create_dataset("ECP", (1, ), dtype="b1", data=bool(cell.has_ecp())) bohrUnit = True Spin = cell.spin GroupParameter.create_dataset("Unit", (1, ), dtype="b1", data=bohrUnit) GroupParameter.create_dataset("NbAlpha", (1, ), dtype="i4", data=cell.nelec[0]) GroupParameter.create_dataset("NbBeta", (1, ), dtype="i4", data=cell.nelec[1]) GroupParameter.create_dataset("NbTotElec", (1, ), dtype="i4", data=cell.nelec[0] + cell.nelec[1]) GroupParameter.create_dataset("spin", (1, ), dtype="i4", data=Spin) #basisset Group GroupBasisSet = H5_qmcpack.create_group("basisset") #Dataset Number Of Atoms GroupBasisSet.create_dataset("NbElements", (1, ), dtype="i4", data=NbSpecies) LCAOName = GroupBasisSet.create_dataset("name", (1, ), dtype="S8") LCAOName[0:] = "LCAOBSet" #atomicBasisSets Group for x in range(NbSpecies): MyIdx = idxAtomstoSpecies[x] atomicBasisSetGroup = GroupBasisSet.create_group("atomicBasisSet" + str(x)) mylen = "S" + str(len(uniq_atoms[x][0])) elemtype = atomicBasisSetGroup.create_dataset("elementType", (1, ), dtype=mylen) elemtype[0:] = uniq_atoms[x][0] if cell.cart == True: Angular = atomicBasisSetGroup.create_dataset("angular", (1, ), dtype="S9") ExpandYLM = atomicBasisSetGroup.create_dataset("expandYlm", (1, ), dtype="S6") Angular[0:] = "cartesian" ExpandYLM[0:] = "Gamess" else: Angular = atomicBasisSetGroup.create_dataset("angular", (1, ), dtype="S9") ExpandYLM = atomicBasisSetGroup.create_dataset("expandYlm", (1, ), dtype="S5") Angular[0:] = "spherical" ExpandYLM[0:] = "pyscf" atomicBasisSetGroup.create_dataset("grid_npts", (1, ), dtype="i4", data=1001) atomicBasisSetGroup.create_dataset("grid_rf", (1, ), dtype="i4", data=100) atomicBasisSetGroup.create_dataset("grid_ri", (1, ), dtype="f8", data=1e-06) gridType = atomicBasisSetGroup.create_dataset("grid_type", (1, ), dtype="S3") gridType[0:] = "log" try: mylen = "S" + str(len(cell.basis)) nameBase = atomicBasisSetGroup.create_dataset("name", (1, ), dtype=mylen) nameBase[0:] = cell.basis except: nameBase = atomicBasisSetGroup.create_dataset("name", (1, ), dtype="S8") nameBase[0:] = "gaussian" Normalized = atomicBasisSetGroup.create_dataset("normalized", (1, ), dtype="S2") Normalized[0:] = "no" nshell = cell.atom_shell_ids(MyIdx) n = 0 for i in nshell: l = cell.bas_angular(i) contracted_coeffs = cell.bas_ctr_coeff(i) contracted_exp = cell.bas_exp(i) for line in zip(*contracted_coeffs): BasisGroup = atomicBasisSetGroup.create_group("basisGroup" + str(n)) basisType = BasisGroup.create_dataset("type", (1, ), dtype="S8") basisType[0:] = "Gaussian" mylen = "S" + str(len((uniq_atoms[x][0] + str(n) + str(l)))) RID = BasisGroup.create_dataset("rid", (1, ), dtype=mylen) RID[0:] = (uniq_atoms[x][0] + str(n) + str(l)) BasisGroup.create_dataset("Shell_coord", (3, ), dtype="f8", data=cell.bas_coord(i)) BasisGroup.create_dataset("NbRadFunc", (1, ), dtype="i4", data=cell.bas_nprim(i)) Val_l = BasisGroup.create_dataset("l", (1, ), dtype="i4", data=l) Val_n = BasisGroup.create_dataset("n", (1, ), dtype="i4", data=n) RadGroup = BasisGroup.create_group("radfunctions") # print "<basisGroup",n," rid=",uniq_atoms[x][0]+str(n)+str(l)," n=",n," l=",l ,"NbRadFunc=",cell.bas_nprim(i),"type=Gaussian>" IdRad = 0 for e, c in zip(contracted_exp, line): DataRadGrp = RadGroup.create_group("DataRad" + str(IdRad)) DataRadGrp.create_dataset("exponent", (1, ), dtype="f8", data=e) DataRadGrp.create_dataset("contraction", (1, ), dtype="f8", data=c) # print "<radfunc exponent=",e," 
contraction=",c, "DataRad=",n,"IdRad=",IdRad,"/>" IdRad += 1 n += 1 atomicBasisSetGroup.create_dataset("NbBasisGroups", (1, ), dtype="i4", data=n) def is_complex(l): try: return is_complex(l[0]) except: return bool(l.imag) GroupDet = H5_qmcpack.create_group("determinant") if cell.cart == True: d_gms_order = { 0: ["s"], 1: ["x", "y", "z"], 2: ["xx", "yy", "zz", "xy", "xz", "yz"], 3: [ "xxx", "yyy", "zzz", "xxy", "xxz", "yyx", "yyz", "zzx", "zzy", "xyz" ], 4: [ "xxxx", "yyyy", "zzzz", "xxxy", "xxxz", "yyyx", "yyyz", "zzzx", "zzzy", "xxyy", "xxzz", "yyzz", "xxyz", "yyxz", "zzxy", "xxxx", "yyyy", "zzzz", "xxxy", "xxxz", "yyyx", "yyyz", "zzzx", "zzzy", "xxyy", "xxzz", "yyzz", "xxyz", "yyxz", "zzxy" ] } d_l = {'s': 0, 'p': 1, 'd': 2, 'f': 3, 'g': 4} def n_orbital(n): if n == 0: return 1 elif n == 1: return 3 else: return 2 * n_orbital(n - 1) - n_orbital(n - 2) + 1 def compare_gamess_style(item1, item2): # Warning: # - d_gms_order is a global variable n1, n2 = map(len, (item1, item2)) assert (n1 == n2) try: l = d_gms_order[n1] except KeyError: return 0 else: a = l.index(item1) b = l.index(item2) return cmp(a, b) ao_label = cell.ao_labels(False) # Create a list of shell l_l = [] for label, name, t, l in ao_label: # Change yyx -> xyy " q = "".join(sorted(l, key=l.count, reverse=True)) l_l.append(q) # Pyscf ordering of shell l_order = list(range(len(l_l))) # Shell ordering indexed n = 1 l_order_new = [] for i, (label, name, t, l) in enumerate(ao_label): r = d_l[t[-1]] # print r,n_orbital(r) if n != 1: n -= 1 else: n = n_orbital(r) unordered_l = l_l[i:i + n] unordered = l_order[i:i + n] #print i,n,unordered ordered = [ x for _, x in sorted(zip(unordered_l, unordered), key=lambda p: p[0], cmp=compare_gamess_style) ] l_order_new.extend(ordered) def order_mo_coef(ll): # Order a list of transposed mo_coeff (Ao,Mo) -> (Mo,Ao) ordered # Warning: # - l_order_new is used as global variable # - gamess order ll_new = [] for l in zip(*ll): ll_new.append([l[i] for i in l_order_new]) return ll_new mo_coeff = mf.mo_coeff Complex = is_complex(mo_coeff) if Complex: mytype = "c16" else: mytype = "f8" GroupParameter.create_dataset("IsComplex", (1, ), dtype="b1", data=Complex) GroupParameter.create_dataset("SpinUnResticted", (1, ), dtype="b1", data=UnRestricted) if not PBC: if UnRestricted == False: NbMO = len(mo_coeff) NbAO = len(mo_coeff[0]) if cell.cart == True: eigenset = GroupDet.create_dataset( "eigenset_0", (NbMO, NbAO), dtype="f8", data=order_mo_coef(mo_coeff)) else: eigenset = GroupDet.create_dataset("eigenset_0", (NbMO, NbAO), dtype="f8", data=zip(*mo_coeff)) else: NbMO = len(mo_coeff[0]) NbAO = len(mo_coeff[0][0]) eigenset_up = GroupDet.create_dataset("eigenset_0", (NbMO, NbAO), dtype="f8", data=order_mo_coef( mo_coeff[0])) eigenset_dn = GroupDet.create_dataset("eigenset_1", (NbMO, NbAO), dtype="f8", data=order_mo_coef( mo_coeff[1])) else: #Cell Parameters GroupCell = H5_qmcpack.create_group("Cell") GroupCell.create_dataset("LaticeVectors", (3, 3), dtype="f8", data=cell.lattice_vectors()) Nbkpts = len(kpts) GroupDet.create_dataset("Nb_Kpoints", (1, ), dtype="i4", data=Nbkpts) if not UnRestricted: NbMO = len(mo_coeff[0]) NbAO = len(mo_coeff[0][0]) else: NbMO = len(mo_coeff[0][0]) NbAO = len(mo_coeff[0][0][0]) def get_mo(mo_coeff, cart): return order_mo_coef(mo_coeff) if cart else zip(*mo_coeff) for i in range(Nbkpts): GroupKpts = GroupDet.create_group("Kpoint_" + str(i)) GroupKpts.create_dataset("Coord", (1, 3), dtype="f8", data=kpts[i]) GroupSpin = GroupKpts.create_group("spin_Up") if not UnRestricted: 
mo_coeff_ = get_mo(mo_coeff[i], cell.cart) GroupSpin.create_dataset("MO_Coeff", (NbMO, NbAO), dtype=mytype, data=mo_coeff_) GroupSpin.create_dataset("MO_EIGENVALUES", (1, NbMO), dtype="f8", data=mf.mo_energy[i]) else: GroupSpindn = GroupKpts.create_group("spin_Dn") mo_coeff_up = get_mo(mo_coeff[0][i], cell.cart) mo_coeff_down = get_mo(mo_coeff[1][i], cell.cart) GroupSpin.create_dataset("MO_Coeff", (NbMO, NbAO), dtype=mytype, data=mo_coeff_up) GroupSpindn.create_dataset("MO_Coeff", (NbMO, NbAO), dtype=mytype, data=mo_coeff_down) GroupSpin.create_dataset("MO_EIGENVALUES", (1, NbMO), dtype="f8", data=mf.mo_energy[0][i]) GroupSpindn.create_dataset("MO_EIGENVALUES", (1, NbMO), dtype="f8", data=mf.mo_energy[1][i]) GroupParameter.create_dataset("COMPLEX", (1, ), dtype="i4", data=Complex) GroupParameter.create_dataset("numMO", (1, ), dtype="i4", data=NbMO) GroupParameter.create_dataset("numAO", (1, ), dtype="i4", data=NbAO) print 'Wavefunction successfuly saved to QMCPACK HDF5 Format' print 'Use: "convert4qmc -Pyscf {}.h5" to generate QMCPACK input files'.format( title) # Close the file before exiting H5_qmcpack.close()
def test_create(self): filename = self.getFileName("create_attribute") print("filename:", filename) f = h5py.File(filename, 'w') is_hsds = False if isinstance(f.id.id, str) and f.id.id.startswith("g-"): is_hsds = True # HSDS has different permission defaults g1 = f.create_group('g1') g1.attrs['a1'] = 42 n = g1.attrs['a1'] self.assertEqual(n, 42) self.assertTrue('a1' in g1.attrs) self.assertTrue(u'a1' in g1.attrs) self.assertTrue(b'a1' in g1.attrs) self.assertEqual(len(g1.attrs), 1) g1.attrs['b1'] = list(range(10)) # try replacing 'a1' g1.attrs['a1'] = 24 self.assertEqual(len(g1.attrs), 2) # create an attribute with explict UTF type dt = h5py.special_dtype(vlen=str) g1.attrs.create('c1', "Hello HDF", dtype=dt) self.assertTrue('c1' in g1.attrs) value = g1.attrs['c1'] self.assertEqual(value, "Hello HDF") # create attribute with as a fixed length string g1.attrs.create('d1', np.string_("This is a numpy string")) value = g1.attrs['d1'] attr_names = [] for a in g1.attrs: attr_names.append(a) self.assertEqual(len(attr_names), 4) self.assertTrue('a1' in attr_names) self.assertTrue('b1' in attr_names) self.assertTrue('c1' in attr_names) self.assertTrue('d1' in attr_names) # create an array attribute g1.attrs["ones"] = np.ones((10, )) arr = g1.attrs["ones"] self.assertTrue(isinstance(arr, np.ndarray)) self.assertEqual(arr.shape, (10, )) for i in range(10): self.assertEqual(arr[i], 1) # array of strings g1.attrs['strings'] = [np.string_("Hello"), np.string_("Good-bye")] arr = g1.attrs['strings'] self.assertEqual(arr.shape, (2, )) self.assertEqual(arr[0], b"Hello") self.assertEqual(arr[1], b"Good-bye") #if six.PY3: # self.assertEqual(arr.dtype, h5py.special_dtype(vlen=str)) #else: self.assertEqual(arr.dtype.kind, 'S') # TBD - h5serv is returning S11 here for some reason #self.assertEqual(arr.dtype, np.dtype("S8")) # scalar byte values g1.attrs['e1'] = "Hello" s = g1.attrs['e1'] self.assertEqual(s, "Hello") # scalar objref attribute g11 = g1.create_group('g1.1') # create subgroup g1/g1.1 g11.attrs['name'] = 'g1.1' # tag group with an attribute if is_hsds: # following is not working with h5serv g11_ref = g11.ref # get ref to g1/g1.1 self.assertTrue(isinstance(g11_ref, h5py.Reference)) refdt = h5py.special_dtype(ref=h5py.Reference) # create ref dtype g1.attrs.create('f1', g11_ref, dtype=refdt) # create attribute with ref to g1.1 ref = g1.attrs['f1'] # read back the attribute refobj = f[ref] # get the ref'd object self.assertTrue('name' in refobj.attrs) # should see the tag attribute self.assertEqual(refobj.attrs['name'], 'g1.1') # check tag value # close file f.close()
print("output file:", config.output_file)
f = h5py.File(config.output_file, 'w')

total_rows = 0
for input_file in config.input_files:
    if not os.path.isfile(input_file):
        raise ValueError(input_file + " does not exist")
    npz_file = np.load(input_file)
    total_rows += npz_file['event_id'].shape[0]

dset_labels = f.create_dataset("labels",
                               shape=(total_rows,),
                               dtype=np.int32)
dset_PATHS = f.create_dataset("root_files",
                              shape=(total_rows,),
                              dtype=h5py.special_dtype(vlen=str))
dset_IDX = f.create_dataset("event_ids",
                            shape=(total_rows,),
                            dtype=np.int32)
dset_event_data = f.create_dataset("event_data",
                                   shape=(total_rows, 27, 27, 38),
                                   dtype=np.float32)
dset_energies = f.create_dataset("energies",
                                 shape=(total_rows, 1),
                                 dtype=np.float32)
dset_positions = f.create_dataset("positions",
                                  shape=(total_rows, 1, 3),
                                  dtype=np.float32)
dset_angles = f.create_dataset("angles",
                               shape=(total_rows, 2),
                               dtype=np.float32)
def storeClassLabels(self, classLabels):
    dt = h5py.special_dtype(vlen=str)
    labelSet = self.db.create_dataset("label_names", (len(classLabels),), dtype=dt)
    labelSet[:] = classLabels
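# NOTE (added sketch): reading the label names back from the dataset created above.
# Depending on the h5py version, vlen-str data may come back as bytes, hence the
# defensive decode; the file name is an assumption.
import h5py

with h5py.File('features.hdf5', 'r') as db:
    label_names = [n.decode('utf-8') if isinstance(n, bytes) else n
                   for n in db['label_names'][:]]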
def prepare_data(input_folder, output_file, size, input_channels, target_resolution): ''' Main function that prepares a dataset from the raw challenge data to an hdf5 dataset ''' if len(size) != 3: raise AssertionError('Inadequate number of size parameters') if len(target_resolution) != 3: raise AssertionError('Inadequate number of target resolution parameters') hdf5_file = h5py.File(output_file, "w") file_list = {'test': [], 'train': [], 'validation': []} logging.info('Counting files and parsing meta data...') pid = 0 for folder in os.listdir(input_folder): print(folder) train_test = test_train_val_split(pid) pid = pid + 1 file_list[train_test].append(folder) n_train = len(file_list['train']) n_test = len(file_list['test']) n_val = len(file_list['validation']) print('Debug: Check if sets add up to correct value:') print(n_train, n_val, n_test, n_train + n_val + n_test) # Create datasets for images and masks data = {} for tt, num_points in zip(['test', 'train', 'validation'], [n_test, n_train, n_val]): if num_points > 0: print([num_points] + list(size) + [input_channels]) data['images_%s' % tt] = hdf5_file.create_dataset("images_%s" % tt, [num_points] + list(size) + [input_channels], dtype=np.float32) data['masks_%s' % tt] = hdf5_file.create_dataset("masks_%s" % tt, [num_points] + list(size), dtype=np.uint8) data['pids_%s' % tt] = hdf5_file.create_dataset("pids_%s" % tt, [num_points] , dtype=h5py.special_dtype(vlen=str)) mask_list = {'test': [], 'train': [], 'validation': []} img_list = {'test': [], 'train': [], 'validation': []} pids_list = {'test': [], 'train': [], 'validation': []} logging.info('Parsing image files') #get max dimension in z-axis maxX = 0 maxY = 0 maxZ = 0 # maxXCropped = 0 # maxYCropped = 0 # maxZCropped = 0 i = 0 for train_test in ['test', 'train', 'validation']: for folder in file_list[train_test]: print("Doing file {}".format(i)) i += 1 baseFilePath = os.path.join(input_folder, folder, folder) img_c1, _, img_header = utils.load_nii(baseFilePath + "_t1.nii.gz") img_c2, _, _ = utils.load_nii(baseFilePath + "_t1ce.nii.gz") img_c3, _, _ = utils.load_nii(baseFilePath + "_t2.nii.gz") img_c4, _, _ = utils.load_nii(baseFilePath + "_flair.nii.gz") img_dat = np.stack((img_c1, img_c2, img_c3, img_c4), 3) maxX = max(maxX, img_dat.shape[0]) maxY = max(maxY, img_dat.shape[1]) maxZ = max(maxZ, img_dat.shape[2]) # img_dat_cropped = crop_volume_allDim(img_dat) # maxXCropped = max(maxXCropped, img_dat_cropped.shape[0]) # maxYCropped = max(maxYCropped, img_dat_cropped.shape[1]) # maxZCropped = max(maxZCropped, img_dat_cropped.shape[2]) print("Max x: {}, y: {}, z: {}".format(maxX, maxY, maxZ)) # print("Max cropped x: {}, y: {}, z: {}".format(maxXCropped, maxYCropped, maxZCropped)) for train_test in ['train', 'test', 'validation']: write_buffer = 0 counter_from = 0 for folder in file_list[train_test]: logging.info('-----------------------------------------------------------') logging.info('Doing: %s' % folder) patient_id = folder baseFilePath = os.path.join(input_folder, folder, folder) img_c1, _, img_header = utils.load_nii(baseFilePath + "_t1.nii.gz") img_c2, _, _ = utils.load_nii(baseFilePath + "_t1ce.nii.gz") img_c3, _, _ = utils.load_nii(baseFilePath + "_t2.nii.gz") img_c4, _, _ = utils.load_nii(baseFilePath + "_flair.nii.gz") mask, _, _ = utils.load_nii(baseFilePath + "_seg.nii.gz") img = np.stack((img_c1, img_c2, img_c3, img_c4), 3) # img, mask = crop_volume_allDim(img_dat.copy(), mask_dat.copy()) pixel_size = (img_header.structarr['pixdim'][1], img_header.structarr['pixdim'][2], 
img_header.structarr['pixdim'][3]) logging.info('Pixel size:') logging.info(pixel_size) ### PROCESSING LOOP FOR 3D DATA ################################ scale_vector = [pixel_size[0] / target_resolution[0], pixel_size[1] / target_resolution[1], pixel_size[2]/ target_resolution[2]] if scale_vector != [1.0, 1.0, 1.0]: img = transform.rescale(img, scale_vector, order=1, preserve_range=True, multichannel=True, mode='constant') mask = transform.rescale(mask, scale_vector, order=0, preserve_range=True, multichannel=False, mode='constant') img = crop_or_pad_slice_to_size(img, size, input_channels) mask = crop_or_pad_slice_to_size(mask, size) img = normalise_image(img) img_list[train_test].append(img) mask_list[train_test].append(mask) pids_list[train_test].append(patient_id) write_buffer += 1 if write_buffer >= MAX_WRITE_BUFFER: counter_to = counter_from + write_buffer _write_range_to_hdf5(data, train_test, img_list, mask_list, pids_list, counter_from, counter_to) _release_tmp_memory(img_list, mask_list, pids_list, train_test) # reset stuff for next iteration counter_from = counter_to write_buffer = 0 logging.info('Writing remaining data') counter_to = counter_from + write_buffer if len(file_list[train_test]) > 0: _write_range_to_hdf5(data, train_test, img_list, mask_list, pids_list, counter_from, counter_to) _release_tmp_memory(img_list, mask_list, pids_list, train_test) # After test train loop: hdf5_file.close()
def pretrain_save_data_parameters( data_dir, speakers_file='speakers.list', params_file='pretrain_params.h5', ): # TODO Document this function # Save processing start time start_time = time() print('Starting') longest_sequence = 0 files_list = [] num_spk = len([entry for entry in scandir(data_dir) if entry.is_dir()]) spk_max = np.zeros((num_spk, 42)) spk_min = 1e+50 * np.ones((num_spk, 42)) speakers = open(os.path.join(data_dir, speakers_file), 'r').readlines() # Strip '\n' characters dirs = [line.split('\n')[0] for line in speakers] print("Processing speakers' data") for spk_index, a_dir in enumerate(dirs): for sub_root, _, sub_files in os.walk(os.path.join(data_dir, a_dir)): # Get basenames of files in directory basenames = list( set([ os.path.join(sub_root, file.split('.')[0]) for file in sub_files ])) files_list += basenames for basename in basenames: print('Processing ' + basename) lf0_params = parse_file(1, basename + '.lf0_log') if lf0_params.shape[0] > longest_sequence: longest_sequence = lf0_params.shape[0] mcp_params = parse_file(40, basename + '.cc') mvf_params = parse_file(1, basename + '.i.fv') seq_params = np.concatenate( (mcp_params, lf0_params, mvf_params), axis=1) # Compute maximum and minimum values spk_max[spk_index, :] = np.maximum( spk_max[spk_index, :], np.ma.max(seq_params, axis=0)) spk_min[spk_index, :] = np.minimum( spk_min[spk_index, :], np.ma.min(seq_params, axis=0)) print('Saving data to .h5 file') with File(os.path.join(data_dir, params_file), 'w') as f: # Save longest_sequence and the max and min values as attributes f.attrs.create('longest_sequence', longest_sequence, dtype=int) f.attrs.create('speakers_max', spk_max) f.attrs.create('speakers_min', spk_min) # TODO Support Python 2 # sys.version_info -> Get running Python version dt = special_dtype(vlen=str) utf_list = [ n.encode(encoding="utf-8", errors="ignore") for n in files_list ] f.create_dataset(name='files_list', shape=(len(utf_list), 1), data=utf_list, dtype=dt) f.close() print('Elapsed time: ' + display_time(time() - start_time)) longest_sequence = int(np.floor(longest_sequence * 1.7)) return longest_sequence, spk_max, spk_min, files_list
import glob
import h5py
import io
import numpy as np
import os
import sqlite3
import time

data_dir = "data_10000" + "/mnt/ramdisk/max/90kDICT32px/"
level_0 = glob.glob(os.path.join(data_dir + "/*"))
if len(level_0) == 0:
    raise ValueError("No files in directory " + data_dir)

filename = "hdf5-numpy-10000.hdf5"
if os.path.exists(filename):
    os.remove(filename)
f = h5py.File(filename, 'w')
dt = h5py.special_dtype(vlen=np.dtype('uint8'))
dset = f.create_dataset('images', (10000, ), dtype=dt)

################################# SQL file with meta info
### sqlite file
filename = "hdf5-numpy-10000.sqlite"
if os.path.exists(filename):
    os.remove(filename)
sql_conn = sqlite3.connect(filename)
cur = sql_conn.cursor()
sql_table = ('CREATE TABLE IF NOT EXISTS meta (key integer PRIMARY KEY, '
             'label text NOT NULL, path text, shape0 integer, shape1 integer, '
             'shape2 integer);')
cur.execute(sql_table)

#################################
files_counter = 0
tic = time.time()
def contributors(self, value: list[str]):
    self._contributors = np.asarray(value, dtype=h5py.special_dtype(vlen=str))
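A minimal round-trip sketch of the same variable-length-string pattern, not part of the class above; the file and attribute names are hypothetical, and the decode step is only a defensive guard for h5py versions that return bytes.

import h5py
import numpy as np

contributors = np.asarray(["Alice", "Bob"], dtype=h5py.special_dtype(vlen=str))
with h5py.File("contributors_demo.h5", "w") as f:   # hypothetical file name
    f.attrs.create("contributors", contributors, dtype=h5py.special_dtype(vlen=str))
with h5py.File("contributors_demo.h5", "r") as f:
    names = [n.decode() if isinstance(n, bytes) else n
             for n in f.attrs["contributors"]]
    print(names)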
def H5AnnotationFile(annotype, annoid, kv=None): """Create an HDF5 file and populate the fields. Return a file object. This is a support routine for all the RAMON tests.""" # Create an in-memory HDF5 file tmpfile = tempfile.NamedTemporaryFile() h5fh = h5py.File(tmpfile.name) # Create the top level annotation id namespace idgrp = h5fh.create_group(str(annoid)) # Annotation type idgrp.create_dataset("ANNOTATION_TYPE", (1, ), np.uint32, data=annotype) # Create a metadata group mdgrp = idgrp.create_group("METADATA") # now lets add a bunch of random values for the specific annotation type ann_status = random.randint(0, 4) ann_confidence = random.random() ann_author = 'randal' # Set Annotation specific metadata mdgrp.create_dataset("STATUS", (1, ), np.uint32, data=ann_status) mdgrp.create_dataset("CONFIDENCE", (1, ), np.float, data=ann_confidence) mdgrp.create_dataset("AUTHOR", (1, ), dtype=h5py.special_dtype(vlen=str), data=ann_author) kvpairs = {} if kv != None: [k, sym, v] = kv.partition(':') kvpairs[k] = v # Turn our dictionary into a csv file fstring = cStringIO.StringIO() csvw = csv.writer(fstring, delimiter=',') csvw.writerows([r for r in kvpairs.iteritems()]) # User-defined metadata mdgrp.create_dataset("KVPAIRS", (1, ), dtype=h5py.special_dtype(vlen=str), data=fstring.getvalue()) # Synapse: if annotype == 2: syn_weight = random.random() * 1000.0 syn_synapse_type = random.randint(1, 9) syn_seeds = [random.randint(1, 1000) for x in range(5)] syn_segments = [[random.randint(1, 1000), random.randint(1, 1000)] for x in range(4)] mdgrp.create_dataset("WEIGHT", (1, ), np.float, data=syn_weight) mdgrp.create_dataset("SYNAPSE_TYPE", (1, ), np.uint32, data=syn_synapse_type) mdgrp.create_dataset("SEEDS", (len(syn_seeds), ), np.uint32, data=syn_seeds) mdgrp.create_dataset("SEGMENTS", (len(syn_segments), 2), np.uint32, data=syn_segments) # Seed elif annotype == 3: seed_parent = random.randint(1, 1000) seed_position = [random.randint(1, 10000) for x in range(3)] seed_cubelocation = random.randint(1, 9) seed_source = random.randint(1, 1000) mdgrp.create_dataset("PARENT", (1, ), np.uint32, data=seed_parent) mdgrp.create_dataset("CUBE_LOCATION", (1, ), np.uint32, data=seed_cubelocation) mdgrp.create_dataset("SOURCE", (1, ), np.uint32, data=seed_source) mdgrp.create_dataset("POSITION", (3, ), np.uint32, data=seed_position) # Segment elif annotype == 4: seg_parentseed = random.randint(1, 100000) seg_segmentclass = random.randint(1, 9) seg_neuron = random.randint(1, 100000) seg_synapses = [random.randint(1, 100000) for x in range(5)] seg_organelles = [random.randint(1, 100000) for x in range(5)] mdgrp.create_dataset("SEGMENTCLASS", (1, ), np.uint32, data=seg_segmentclass) mdgrp.create_dataset("PARENTSEED", (1, ), np.uint32, data=seg_parentseed) mdgrp.create_dataset("NEURON", (1, ), np.uint32, data=seg_neuron) mdgrp.create_dataset("SYNAPSES", (len(seg_synapses), ), np.uint32, seg_synapses) mdgrp.create_dataset("ORGANELLES", (len(seg_organelles), ), np.uint32, seg_organelles) # Neuron elif annotype == 5: neuron_segments = [random.randint(1, 1000) for x in range(10)] mdgrp.create_dataset("SEGMENTS", (len(neuron_segments), ), np.uint32, neuron_segments) # Organelle elif annotype == 6: org_parentseed = random.randint(1, 100000) org_organelleclass = random.randint(1, 9) org_seeds = [random.randint(1, 100000) for x in range(5)] org_centroid = [random.randint(1, 10000) for x in range(3)] mdgrp.create_dataset("ORGANELLECLASS", (1, ), np.uint32, data=org_organelleclass) mdgrp.create_dataset("PARENTSEED", (1, ), 
np.uint32, data=org_parentseed) mdgrp.create_dataset("SEEDS", (len(org_seeds), ), np.uint32, org_seeds) mdgrp.create_dataset("CENTROID", (3, ), np.uint32, data=org_centroid) h5fh.flush() tmpfile.seek(0) return tmpfile
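The METADATA datasets above mix fixed-width numeric types with variable-length strings. Below is a minimal standalone sketch of that pattern with a hypothetical file name and values; it uses plain `float` because the `np.float` alias used above was removed in NumPy 1.20.

import h5py
import numpy as np

with h5py.File("annotation_demo.h5", "w") as f:   # hypothetical file name
    md = f.create_group("100/METADATA")
    md.create_dataset("STATUS", (1, ), np.uint32, data=2)
    md.create_dataset("CONFIDENCE", (1, ), float, data=0.87)  # np.float is no longer available
    md.create_dataset("AUTHOR", (1, ), dtype=h5py.special_dtype(vlen=str), data="randal")
    print(md["AUTHOR"][0])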
def utf8(string): if isinstance(string, bytes): return bytes.decode(string, 'utf-8') return string def py_str(byte_string): if isinstance(byte_string, np.ndarray): byte_string = bytes(byte_string) assert isinstance(byte_string, bytes) return byte_string.decode('ASCII') def isstring(s): return isinstance(s, str) def execcode(code, globals, locals=None): if locals is None: exec(code, globals) else: exec(code, globals, locals) h5vstring = h5py.special_dtype(vlen=bytes) import builtins import activepapers.builtins3 as ap_builtins # Replace the "del exec" in builtins3 by something that's not a # syntax error under Python 2. del ap_builtins.__dict__['exec'] raw_input = builtins.input
def createSpecificSynapse(annoid, syn_segments, cutout): # Create an in-memory HDF5 file tmpfile = tempfile.NamedTemporaryFile() h5fh = h5py.File(tmpfile.name) # Create the top level annotation id namespace idgrp = h5fh.create_group(str(annoid)) # Annotation type idgrp.create_dataset("ANNOTATION_TYPE", (1, ), np.uint32, data=2) # Create a metadata group mdgrp = idgrp.create_group("METADATA") # now lets add a bunch of random values for the specific annotation type ann_status = random.randint(0, 4) ann_confidence = random.random() ann_author = 'randal' # Set Annotation specific metadata mdgrp.create_dataset("STATUS", (1, ), np.uint32, data=ann_status) mdgrp.create_dataset("CONFIDENCE", (1, ), np.float, data=ann_confidence) mdgrp.create_dataset("AUTHOR", (1, ), dtype=h5py.special_dtype(vlen=str), data=ann_author) syn_weight = random.random() * 1000.0 syn_synapse_type = random.randint(1, 9) [resstr, xstr, ystr, zstr] = cutout.split('/') (xlowstr, xhighstr) = xstr.split(',') (ylowstr, yhighstr) = ystr.split(',') (zlowstr, zhighstr) = zstr.split(',') resolution = int(resstr) xlow = int(xlowstr) xhigh = int(xhighstr) ylow = int(ylowstr) yhigh = int(yhighstr) zlow = int(zlowstr) zhigh = int(zhighstr) anndata = np.ones([zhigh - zlow, yhigh - ylow, xhigh - xlow]) #import pdb; pdb.set_trace() mdgrp.create_dataset("WEIGHT", (1, ), np.float, data=syn_weight) mdgrp.create_dataset("SYNAPSE_TYPE", (1, ), np.uint32, data=syn_synapse_type) mdgrp.create_dataset("SEGMENTS", (len(syn_segments), 2), np.uint32, data=syn_segments) idgrp.create_dataset("RESOLUTION", (1, ), np.uint32, data=resolution) idgrp.create_dataset("XYZOFFSET", (3, ), np.uint32, data=[xlow, ylow, zlow]) idgrp.create_dataset("CUTOUT", anndata.shape, np.uint32, data=anndata) h5fh.flush() tmpfile.seek(0) return tmpfile
def load_data(path, file_ext=['txt'], valid_split=None, vocab_file_name=None, max_vocab_size=None, max_len_w=None, output_path=None, subset_pct=100): """ Given a path where data are saved, look for the ones with the right extensions If a split factor is given, it will split all the files into training and valid set. Then build vocabulary from the training and validation sets. Arguments: path: which directory to look for all the documents file_ext: what extension of the files to look for valid_split: to split the data into train/valid set. If None, no split vocab_file_name: optional file name. If None, the script will decide a name given path and split max_vocab_size: maximum number of words to use in vocabulary (by most frequent) max_len_w: maximum length of sentences in words output_path: path used to save preprocessed data and resuts subset_pct: subset of dataset to load into H5 file (percentage) Returns: The function saves 2 files: h5 file with preprocessed data vocabulary file with: vocab, reverse_vocab, word_count """ file_names = get_file_list(path, file_ext) file_str = get_file_str(path, len(file_names), labelled=False, valid_split=valid_split, subset_pct=subset_pct) # create output dir if needed if not os.path.isdir(output_path): os.makedirs(output_path) # file name to store the vocabulary if vocab_file_name is None: vocab_file_name = file_str + '.vocab' vocab_file_name = os.path.join(output_path, vocab_file_name) # If max sizes arent set, assume no limit if not max_len_w: max_len_w = sys.maxsize if not max_vocab_size: max_vocab_size = sys.maxsize # file name to store the pre-processed train/valid dataset h5_file_name = os.path.join(output_path, file_str + '.h5') if os.path.exists(h5_file_name) and os.path.exists(vocab_file_name): neon_logger.display( "dataset files {} and vocabulary file {} already exist. " "will use cached data. ".format(h5_file_name, vocab_file_name)) return h5_file_name, vocab_file_name # split into training/valid set if valid_split is not None: if 'json' in file_ext: # Split based on number of files train_split = int(np.ceil(len(file_names) * (1 - valid_split))) train_files = file_names[:train_split] valid_files = file_names[train_split:] train_sent = load_json_sent(train_files, subset_pct) valid_sent = load_json_sent(valid_files, subset_pct) all_sent = train_sent + valid_sent elif 'txt' in file_ext: # Split based on number of lines (since only 2 files) all_sent = load_txt_sent(file_names, subset_pct) train_split = int(np.ceil(len(all_sent) * (1 - valid_split))) train_sent = all_sent[:train_split] valid_sent = all_sent[train_split:] else: neon_logger.display( "Unsure how to load file_ext {}, please use 'json' or 'txt'.". format(file_ext)) else: train_files = file_names if 'json' in file_ext: train_sent = load_json_sent(train_files, subset_pct) elif 'txt' in file_ext: train_sent = load_txt_sent(train_files, subset_pct) else: neon_logger.display( "Unsure how to load file_ext {}, please use 'json' or 'txt'.". format(file_ext)) all_sent = train_sent if os.path.exists(vocab_file_name): neon_logger.display( "open existing vocab file: {}".format(vocab_file_name)) vocab, rev_vocab, word_count = load_obj(vocab_file_name) else: neon_logger.display("Building vocab file") # build vocab word_count = defaultdict(int) for sent in all_sent: sent_words = tokenize(sent) if len(sent_words) > max_len_w or len(sent_words) == 0: continue for word in sent_words: word_count[word] += 1 # sort the word_count , re-assign ids by its frequency. 
Useful for downstream tasks # only done for train vocab vocab_sorted = sorted(word_count.items(), key=lambda kv: kv[1], reverse=True) vocab = OrderedDict() # get word count as array in same ordering as vocab (but with maximum length) word_count_ = np.zeros((len(word_count), ), dtype=np.int64) for i, t in enumerate(list(zip(*vocab_sorted))[0][:max_vocab_size]): word_count_[i] = word_count[t] vocab[t] = i word_count = word_count_ # generate the reverse vocab rev_vocab = dict((wrd_id, wrd) for wrd, wrd_id in vocab.items()) neon_logger.display("vocabulary from {} is saved into {}".format( path, vocab_file_name)) save_obj((vocab, rev_vocab, word_count), vocab_file_name) vocab_size = len(vocab) neon_logger.display( "\nVocab size from the dataset is: {}".format(vocab_size)) neon_logger.display( "\nProcessing and saving training data into {}".format(h5_file_name)) # now process and save the train/valid data h5f = h5py.File(h5_file_name, 'w', libver='latest') shape, maxshape = (len(train_sent), ), (None) dt = np.dtype([('text', h5py.special_dtype(vlen=str)), ('num_words', np.uint16)]) report_text_train = h5f.create_dataset('report_train', shape=shape, maxshape=maxshape, dtype=dt, compression='gzip') report_train = h5f.create_dataset('train', shape=shape, maxshape=maxshape, dtype=h5py.special_dtype(vlen=np.int32), compression='gzip') # map text to integers wdata = np.zeros((1, ), dtype=dt) ntrain = 0 for sent in train_sent: text_int = [-1 if t not in vocab else vocab[t] for t in tokenize(sent)] # enforce maximum sentence length if len(text_int) > max_len_w or len(text_int) == 0: continue report_train[ntrain] = text_int wdata['text'] = clean_string(sent) wdata['num_words'] = len(text_int) report_text_train[ntrain] = wdata ntrain += 1 report_train.attrs['nsample'] = ntrain report_train.attrs['vocab_size'] = vocab_size report_text_train.attrs['nsample'] = ntrain report_text_train.attrs['vocab_size'] = vocab_size if valid_split: neon_logger.display( "\nProcessing and saving validation data into {}".format( h5_file_name)) shape = (len(valid_sent), ) report_text_valid = h5f.create_dataset('report_valid', shape=shape, maxshape=maxshape, dtype=dt, compression='gzip') report_valid = h5f.create_dataset( 'valid', shape=shape, maxshape=maxshape, dtype=h5py.special_dtype(vlen=np.int32), compression='gzip') nvalid = 0 for sent in valid_sent: text_int = [ -1 if t not in vocab else vocab[t] for t in tokenize(sent) ] # enforce maximum sentence length if len(text_int) > max_len_w or len(text_int) == 0: continue report_valid[nvalid] = text_int wdata['text'] = clean_string(sent) wdata['num_words'] = len(text_int) report_text_valid[nvalid] = wdata nvalid += 1 report_valid.attrs['nsample'] = nvalid report_valid.attrs['vocab_size'] = vocab_size report_text_valid.attrs['nsample'] = nvalid report_text_valid.attrs['vocab_size'] = vocab_size h5f.close() return h5_file_name, vocab_file_name
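load_data stores each sentence twice: as vlen int32 token ids ('train'/'valid') and as a compound record of the raw text plus its word count ('report_train'/'report_valid'). A rough read-back sketch, assuming a hypothetical output file name in place of the real h5_file_name returned by the function:

import h5py

with h5py.File("preprocessed.h5", "r") as h5f:   # hypothetical path
    token_ids = h5f["train"]                     # vlen int32 token ids per sentence
    report_text = h5f["report_train"]            # compound: ('text', vlen str), ('num_words', uint16)
    nsample = token_ids.attrs["nsample"]
    first_ids = token_ids[0]                     # 1-D int32 array for the first sentence
    first_row = report_text[0]
    text, num_words = first_row["text"], first_row["num_words"]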
def WriteModelVariables(self): scalarVariables = self.modelVariable # Get maximum length of string vectors #maxLenName = self._getMaxLength(scalarVariables.keys()) #maxLenDescription = self._getMaxLength([x.description for x in scalarVariables.values()]) # Create dtype object numpyDataType = numpy.dtype({ 'names': [ 'name', 'simpleTypeRow', 'causality', 'variability', 'description', 'objectId', 'column', 'negated' ], 'formats': [ h5py.special_dtype( vlen=unicode), #'S' + str(max(maxLenName, 1)), 'uint32', h5py.special_dtype(enum=(numpy.uint8, CausalityType)), # 'uint8', h5py.special_dtype(enum=(numpy.uint8, VariabilityType)), # 'uint8', h5py.special_dtype( vlen=unicode), #'S' + str(max(maxLenDescription, 1)), h5py.special_dtype(ref=h5py.Reference), 'uint32', h5py.special_dtype(enum=(numpy.uint8, { 'false': 0, 'true': 1 })) ] }) # 'uint8']}) self.description = self.file.create_group("ModelDescription") # Write information on Simulation group description = self.modelDescription self.description.attrs['modelName'] = description.modelName self.description.attrs['description'] = description.description self.description.attrs['author'] = description.author self.description.attrs['version'] = description.version self.description.attrs['generationTool'] = description.generationTool self.description.attrs[ 'generationDateAndTime'] = description.generationDateAndTime self.description.attrs[ 'variableNamingConvention'] = description.variableNamingConvention dataset = self.description.create_dataset( 'Variables', (len(scalarVariables), 1), dtype=numpyDataType, maxshape=(len(scalarVariables), 1), compression='gzip') # Sort Variables by names nameList = [x for x in scalarVariables.keys()] nameList.sort() allData = [] i = -1 for variableName in nameList: variable = scalarVariables[variableName] i += 1 variable.rowIndex = i x = variableName allData.append((x, variable.simpleTypeRow, variable.causality, variable.variability, variable.description, variable.category.dataset.ref, variable.columnIndex, variable.aliasNegated)) dataset[:, 0] = allData return
def test_create_vlen(self): filename = self.getFileName("create_vlen_attribute") print("filename:", filename) f = h5py.File(filename, 'w') is_hsds = False if isinstance(f.id.id, str) and f.id.id.startswith("g-"): is_hsds = True # HSDS has different permission defaults if not is_hsds: # vlen ref types not working for h5serv, so abort here f.close() return g1 = f.create_group('g1') g1_1 = g1.create_group('g1_1') g1_1.attrs["name"] = 'g1_1' g1_2 = g1.create_group('g1_2') g1_2.attrs["name"] = 'g1_2' g1_3 = g1.create_group('g1_3') g1_3.attrs["name"] = 'g1_3' # create an attribute that is a VLEN int32 dtvlen = h5py.special_dtype(vlen=np.dtype('int32')) e0 = np.array([0, 1, 2]) e1 = np.array([0, 1, 2, 3]) data = np.array([e0, e1], dtype=object) g1.attrs.create("a1", data, shape=(2, ), dtype=dtvlen) ret_val = g1.attrs["a1"] self.assertTrue(isinstance(ret_val, np.ndarray)) self.assertEqual(len(ret_val), 2) self.assertTrue(isinstance(ret_val[0], np.ndarray)) # py36 attribute[a1]: [array([0, 1, 2], dtype=int32) array([0, 1, 2, 3], dtype=int32)] # py27 [(0, 1, 2) (0, 1, 2, 3)] self.assertEqual(list(ret_val[0]), [0, 1, 2]) self.assertEqual(ret_val[0].dtype, np.dtype('int32')) self.assertTrue(isinstance(ret_val[1], np.ndarray)) self.assertEqual(ret_val[1].dtype, np.dtype('int32')) self.assertEqual(list(ret_val[1]), [0, 1, 2, 3]) # create an attribute that is VLEN ObjRef dtref = h5py.special_dtype(ref=h5py.Reference) dtvlen = h5py.special_dtype(vlen=dtref) e0 = np.array((g1_1.ref, ), dtype=dtref) e1 = np.array((g1_1.ref, g1_2.ref), dtype=dtref) e2 = np.array((g1_1.ref, g1_2.ref, g1_3.ref), dtype=dtref) data = [e0, e1, e2] g1.attrs.create("b1", data, shape=(3, ), dtype=dtvlen) vlen_val = g1.attrs["b1"] # read back attribute self.assertTrue(isinstance(vlen_val, np.ndarray)) self.assertEqual(len(vlen_val), 3) for i in range(3): e = vlen_val[i] self.assertTrue(isinstance(e, np.ndarray)) ref_type = h5py.check_dtype(ref=e.dtype) self.assertEqual(ref_type, h5py.Reference) self.assertEqual(e.shape, ((i + 1), )) # first element is always a ref to g1 refd_group = f[e[0]] self.assertEqual(refd_group.attrs['name'], 'g1_1') # create an attribute with compound type of vlen objref and int32 dtcompound = np.dtype([('refs', dtvlen), ('number', 'int32')]) # create np array with data for the attribute # note: two step process is needed, see: https://github.com/h5py/h5py/issues/573 data = np.zeros((2, ), dtype=dtcompound) data[0] = (e1, 1) data[1] = (e2, 2) g1.attrs.create("c1", data, shape=(2, ), dtype=dtcompound) compound_val = g1.attrs["c1"] self.assertTrue(isinstance(compound_val, np.ndarray)) self.assertEqual(len(compound_val), 2) self.assertEqual(len(compound_val.dtype), 2) for i in range(2): item = compound_val[i] self.assertTrue(isinstance(item, np.void)) self.assertEqual(len(item), 2) e = item[0] self.assertEqual(len(e), i + 2) refd_group = f[e[0]] self.assertEqual(refd_group.attrs['name'], 'g1_1') self.assertEqual(item[1], i + 1) # close file f.close()
def addAnno ( self, annotype, annoid, kv=None ): """Add an annotation to the file.""" # Create the top level annotation id namespace idgrp = self.h5fh.create_group ( str(annoid) ) # Annotation type idgrp.create_dataset ( "ANNOTATION_TYPE", (1,), np.uint32, data=annotype ) # Create a metadata group mdgrp = idgrp.create_group ( "METADATA" ) # now lets add a bunch of random values for the specific annotation type ann_status = random.randint(0,4) ann_confidence = random.random() ann_author = 'randal' # Set Annotation specific metadata mdgrp.create_dataset ( "STATUS", (1,), np.uint32, data=ann_status ) mdgrp.create_dataset ( "CONFIDENCE", (1,), np.float, data=ann_confidence ) mdgrp.create_dataset ( "AUTHOR", (1,), dtype=h5py.special_dtype(vlen=str), data=ann_author ) kvpairs={} if kv!= None: [ k, sym, v ] = kv.partition(':') kvpairs[k]=v # Turn our dictionary into a csv file fstring = cStringIO.StringIO() csvw = csv.writer(fstring, delimiter=',') csvw.writerows([r for r in kvpairs.iteritems()]) # User-defined metadata mdgrp.create_dataset ( "KVPAIRS", (1,), dtype=h5py.special_dtype(vlen=str), data=fstring.getvalue()) # Synapse: if annotype == 2: syn_weight = random.random()*1000.0 syn_synapse_type = random.randint(1,9) syn_seeds = [ random.randint(1,1000) for x in range(5) ] syn_centroid = [ random.randint(1,10000) for x in range(3) ] syn_segments = [ random.randint(1,1000) for x in range(4) ] syn_presegments = [ random.randint(1,1000) for x in range(3) ] syn_postsegments = [ random.randint(1,1000) for x in range(2) ] mdgrp.create_dataset ( "WEIGHT", (1,), np.float, data=syn_weight ) mdgrp.create_dataset ( "SYNAPSE_TYPE", (1,), np.uint32, data=syn_synapse_type ) mdgrp.create_dataset ( "CENTROID", (3,), np.uint32, data=syn_centroid ) mdgrp.create_dataset ( "SEEDS", (len(syn_seeds),), np.uint32, data=syn_seeds ) mdgrp.create_dataset ( "SEGMENTS", (len(syn_segments),), np.uint32, data=syn_segments) mdgrp.create_dataset ( "PRESEGMENTS", (len(syn_presegments),), np.uint32, data=syn_presegments) mdgrp.create_dataset ( "POSTSEGMENTS", (len(syn_postsegments),), np.uint32, data=syn_postsegments) # Seed elif annotype == 3: seed_parent = random.randint(1,1000) seed_position = [ random.randint(1,10000) for x in range(3) ] seed_cubelocation = random.randint(1,9) seed_source = random.randint(1,1000) mdgrp.create_dataset ( "PARENT", (1,), np.uint32, data=seed_parent ) mdgrp.create_dataset ( "CUBE_LOCATION", (1,), np.uint32, data=seed_cubelocation ) mdgrp.create_dataset ( "SOURCE", (1,), np.uint32, data=seed_source ) mdgrp.create_dataset ( "POSITION", (3,), np.uint32, data=seed_position ) # Segment elif annotype == 4: seg_parentseed = random.randint(1,100000) seg_segmentclass = random.randint(1,9) seg_neuron = random.randint(1,100000) seg_synapses = [ random.randint(1,100000) for x in range(5) ] seg_organelles = [ random.randint(1,100000) for x in range(5) ] mdgrp.create_dataset ( "SEGMENTCLASS", (1,), np.uint32, data=seg_segmentclass ) mdgrp.create_dataset ( "PARENTSEED", (1,), np.uint32, data=seg_parentseed ) mdgrp.create_dataset ( "NEURON", (1,), np.uint32, data=seg_neuron ) mdgrp.create_dataset ( "SYNAPSES", (len(seg_synapses),), np.uint32, seg_synapses ) mdgrp.create_dataset ( "ORGANELLES", (len(seg_organelles),), np.uint32, seg_organelles ) # Neuron elif annotype == 5: neuron_segments = [ random.randint(1,1000) for x in range(10) ] mdgrp.create_dataset ( "SEGMENTS", (len(neuron_segments),), np.uint32, neuron_segments ) # Organelle elif annotype == 6: org_parentseed = random.randint(1,100000) 
org_organelleclass = random.randint(1,9) org_seeds = [ random.randint(1,100000) for x in range(5) ] org_centroid = [ random.randint(1,10000) for x in range(3) ] mdgrp.create_dataset ( "ORGANELLECLASS", (1,), np.uint32, data=org_organelleclass ) mdgrp.create_dataset ( "PARENTSEED", (1,), np.uint32, data=org_parentseed ) mdgrp.create_dataset ( "SEEDS", (len(org_seeds),), np.uint32, org_seeds ) mdgrp.create_dataset ( "CENTROID", (3,), np.uint32, data=org_centroid ) # Node elif annotype == 7: node_nodetype = random.randint(1,9) node_skeletonid = random.randint(1,9) node_parentid = random.randint(1,100000) node_location = [ random.randint(1,10000) for x in range(3) ] node_children = [ random.randint(1,10000) for x in range(5) ] node_diameter = random.random() mdgrp.create_dataset ( "NODETYPE", (1,), np.uint32, data=node_nodetype ) mdgrp.create_dataset ( "SKELETONID", (1,), np.uint32, data=node_skeletonid ) mdgrp.create_dataset ( "PARENTID", (1,), np.uint32, data=node_parentid ) mdgrp.create_dataset ( "LOCATION", (3,), np.uint32, data=node_location ) mdgrp.create_dataset ( "CHILDREN", (5,), np.uint32, data=node_children ) mdgrp.create_dataset ( "DIAMETER", (1,), np.float, data=node_diameter ) # Skeleton elif annotype == 8: skel_skeltype = random.randint(1,9) skel_rootnode = random.randint(1,100000) mdgrp.create_dataset ( "SKELETONTYPE", (1,), np.uint32, data=skel_skeltype ) mdgrp.create_dataset ( "ROOTNODE", (1,), np.uint32, data=skel_rootnode )
def write_dict_to_hdf5(self, data_dict, entry_point): """ Write a (nested) dictionary to HDF5 Args: data_dict (dict): Dicionary to be written entry_point (object): Object to write to """ for key, item in data_dict.items(): if isinstance(key, (float, int)): key = '__' + str(type(key)) + '__' + str(key) if isinstance(item, (str, bool, float, int)): entry_point.attrs[key] = item elif isinstance(item, np.ndarray): entry_point.create_dataset(key, data=item) elif isinstance(item, (np.int32, np.int64)): entry_point.attrs[key] = int(item) elif item is None: # as h5py does not support saving None as attribute # I create special string, note that this can create # unexpected behaviour if someone saves a string with this name entry_point.attrs[key] = 'NoneType:__None__' elif isinstance(item, dict): entry_point.create_group(key) self.write_dict_to_hdf5(data_dict=item, entry_point=entry_point[key]) elif isinstance(item, tuple): self._write_list_group(key, item, entry_point, 'tuple') elif isinstance(item, list): if len(item) > 0: elt_type = type(item[0]) if all(isinstance(x, elt_type) for x in item): if isinstance(item[0], (int, float, np.int32, np.int64)): entry_point.create_dataset(key, data=np.array(item)) entry_point[key].attrs['list_type'] = 'array' elif isinstance(item[0], str): dt = h5py.special_dtype(vlen=str) data = np.array(item) data = data.reshape((-1, 1)) ds = entry_point.create_dataset(key, (len(data), 1), dtype=dt) ds[:] = data elif isinstance(item[0], dict): entry_point.create_group(key) group_attrs = entry_point[key].attrs group_attrs['list_type'] = 'dict' base_list_key = 'list_idx_{}' group_attrs['base_list_key'] = base_list_key group_attrs['list_length'] = len(item) for i, list_item in enumerate(item): list_item_grp = entry_point[key].create_group( base_list_key.format(i)) self.write_dict_to_hdf5( data_dict=list_item, entry_point=list_item_grp) else: logging.warning( 'List of type "{}" for "{}":"{}" not ' 'supported, storing as string'.format( elt_type, key, item)) entry_point.attrs[key] = str(item) else: self._write_list_group(key, item, entry_point, 'list') else: # as h5py does not support saving None as attribute entry_point.attrs[key] = 'NoneType:__emptylist__' else: logging.warning('Type "{}" for "{}":"{}" not supported, ' 'storing as string'.format( type(item), key, item)) entry_point.attrs[key] = str(item)
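A rough usage sketch for the writer above, assuming `saver` is an instance of the (unnamed) class that defines write_dict_to_hdf5; the file name and dictionary contents are hypothetical.

import h5py
import numpy as np

settings = {
    "name": "run_042",
    "temperature": 4.2,
    "enabled": True,
    "channels": ["I", "Q"],                # list of str -> vlen-string dataset
    "calibration": np.arange(5.0),         # ndarray -> plain dataset
    "nested": {"gain": 10, "note": None},  # dict -> subgroup, None -> sentinel string
}
with h5py.File("settings_demo.h5", "w") as f:
    # `saver`: instance of the class defining write_dict_to_hdf5 (hypothetical)
    saver.write_dict_to_hdf5(settings, entry_point=f)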
def storeClassLables(self, classLabels):
    # Create a dataset to store the actual class label names,
    # then store the class labels
    dt = h5py.special_dtype(vlen=unicode)  # Python 2; use vlen=str on Python 3
    labelSet = self.db.create_dataset("label_names", (len(classLabels), ), dtype=dt)
    labelSet[:] = classLabels
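A short read-back sketch for the label dataset above; the database path is hypothetical, and the decode step covers h5py 3, which returns variable-length string data from datasets as bytes.

import h5py

with h5py.File("dataset_demo.h5", "r") as db:   # hypothetical path of the writer's db file
    raw = db["label_names"][:]
    class_labels = [s.decode("utf-8") if isinstance(s, bytes) else s for s in raw]
    print(class_labels)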
import h5py

from .mockdata import write_file
from .mockdata.xgm import XGM
from .mockdata.gec_camera import GECCamera
from .mockdata.basler_camera import BaslerCamera as BaslerCam
from .mockdata.adc import ADC
from .mockdata.uvlamp import UVLamp
from .mockdata.motor import Motor
from .mockdata.tsens import TemperatureSensor
from .mockdata.imgfel import IMGFELCamera, IMGFELMotor
from .mockdata.gauge import Gauge
from .mockdata.dctrl import DCtrl
from .mockdata.mpod import MPOD
from .mockdata.detectors import AGIPDModule, LPDModule

vlen_bytes = h5py.special_dtype(vlen=bytes)


def make_metadata(h5file, data_sources, chunksize=16):
    N = len(data_sources)
    if N % chunksize:
        N += chunksize - (N % chunksize)

    root = [ds.split('/', 1)[0] for ds in data_sources]
    devices = [ds.split('/', 1)[1] for ds in data_sources]

    sources_ds = h5file.create_dataset('METADATA/dataSourceId', (N, ),
                                       dtype=vlen_bytes, maxshape=(None, ))
    sources_ds[:len(data_sources)] = data_sources
    root_ds = h5file.create_dataset('METADATA/root', (N, ),
                                    dtype=vlen_bytes, maxshape=(None, ))
    # assumed continuation, mirroring sources_ds above (the original snippet breaks off here)
    root_ds[:len(data_sources)] = root
def extract_flow_to_hdf5(model, example_list, cleanup_tmp_dirs=True): for example_idx, example in enumerate(example_list): # Parse dataset number from string 'video_000010' ==> int(10) dset_index = int(re.findall(r'\d+', example['dset_name'])[-1]) print('Working on example {}/{}...'.format(example_idx + 1, len(example_list))) tmp_image_dir = example['tmp_image_dir'] num_image_files = len(cortex.utils.find_files(tmp_image_dir, 'jpg')) if num_image_files == 0: print('Skipping directory because no image files found: {}'.format( tmp_image_dir)) continue # Intialize the data loader for this video inference_size = [-1, -1] # largest possible dataset = ImagesFromFolderInference(tmp_image_dir, inference_size, extension='jpg') data_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=2, shuffle=False, pin_memory=True) print('Succesfully initialized dataloader with {} image pairs.'.format( len(dataset))) ###################################################################################### ###################################################################################### flow_minmax = [] flow_images = [] num_batches = int(np.ceil(len(dataset) / args.batch_size)) for batch_idx, (data, target) in enumerate(data_loader): # Prepare inputs for forward pass if args.cuda: data, target = [d.cuda(async=True) for d in data ], [t.cuda(async=True) for t in target] data, target = [Variable(d) for d in data], [Variable(t) for t in target] # Actual forward pass through the network with torch.no_grad(): # Shape = [N,2,H,W] output = model(data[0]) # Saving the outputs for example_idx in range(output.shape[0]): flow_single = output[example_idx].data.cpu().numpy().transpose( 1, 2, 0) # Normalize and get 3-channel image flow_u_norm, min_u, max_u = cortex.vision.flow.normalize_flow( flow_single[:, :, 0]) flow_v_norm, min_v, max_v = cortex.vision.flow.normalize_flow( flow_single[:, :, 1]) flow_as_jpg = np.dstack( (flow_u_norm, flow_v_norm, np.zeros_like(flow_u_norm))) flow_as_jpg = (flow_as_jpg * 255.0).astype(np.uint8) # Save results flow_minmax.append((min_u, max_u, min_v, max_v)) flow_images.append(flow_as_jpg) # All batches are done, now write flow frames to HDF5 file with h5py.File(example['hdf5_file'], 'a') as hf: print('Writing results to HDF5 file...') # Create dataset for frames dt_frames = h5py.special_dtype(vlen=np.dtype('uint8')) dset_flow = hf.create_dataset("flow_{:06d}".format(dset_index), shape=(len(flow_images), ), dtype=dt_frames) for frame_idx, flow_image in enumerate(flow_images): # Apply JPG compression to the raw video frame encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), 100] frame_jpg_encode = cv2.imencode(".jpg", flow_image, encode_params) frame_jpg_encode = frame_jpg_encode[1].tostring() dset_flow[frame_idx] = np.fromstring(frame_jpg_encode, dtype='uint8') flow_minmax = np.asarray(flow_minmax, np.float64) dset_flow = hf.create_dataset( "flow_minmax_{:06d}".format(dset_index), data=flow_minmax) if cleanup_tmp_dirs: print('cleaning up temporary image directory: {}'.format( tmp_image_dir)) shutil.rmtree(tmp_image_dir) print('#' * 60) print('Done.')
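Each stored flow frame above is a JPEG byte stream held in a vlen uint8 dataset. A rough sketch of decoding one frame back, with hypothetical file and dataset names; the de-normalisation assumes normalize_flow mapped each channel's [min, max] range onto [0, 1], which is not shown in the snippet.

import cv2
import h5py

with h5py.File("example.h5", "r") as hf:     # hypothetical file written above
    dset_flow = hf["flow_000010"]            # hypothetical dataset index
    frame_bytes = dset_flow[0]               # 1-D uint8 array holding a JPEG stream
    flow_img = cv2.imdecode(frame_bytes, cv2.IMREAD_COLOR)
    min_u, max_u, min_v, max_v = hf["flow_minmax_000010"][0]
    # approximate inverse of the per-channel normalisation (assumed [min, max] -> [0, 1])
    flow_u = flow_img[:, :, 0] / 255.0 * (max_u - min_u) + min_u
    flow_v = flow_img[:, :, 1] / 255.0 * (max_v - min_v) + min_v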
def write(filename, points, cells, point_data=None, cell_data=None, field_data=None, add_global_ids=True): '''Writes H5M files, cf. https://trac.mcs.anl.gov/projects/ITAPS/wiki/MOAB/h5m. ''' import h5py point_data = {} if point_data is None else point_data cell_data = {} if cell_data is None else cell_data field_data = {} if field_data is None else field_data f = h5py.File(filename, 'w') tstt = f.create_group('tstt') # The base index for h5m is 1. global_id = 1 # add nodes nodes = tstt.create_group('nodes') coords = nodes.create_dataset('coordinates', data=points) coords.attrs.create('start_id', global_id) global_id += len(points) # Global tags tstt_tags = tstt.create_group('tags') # The GLOBAL_ID associated with a point is used to identify points if # distributed across several processes. mbpart automatically adds them, # too. if 'GLOBAL_ID' not in point_data and add_global_ids: point_data['GLOBAL_ID'] = numpy.arange( 1, len(points) + 1, ) # add point data if point_data is not None: tags = nodes.create_group('tags') for key, data in point_data.items(): if len(data.shape) == 1: dtype = data.dtype tags.create_dataset(key, data=data) else: # H5M doesn't accept n-x-k arrays as data; it wants an n-x-1 # array with k-tuples as entries. n, k = data.shape dtype = numpy.dtype((data.dtype, (k, ))) dset = tags.create_dataset(key, (n, ), dtype=dtype) dset[:] = data # Create entry in global tags g = tstt_tags.create_group(key) g['type'] = dtype # Add a class tag: # From # <http://lists.mcs.anl.gov/pipermail/moab-dev/2015/007104.html>: # ``` # /* Was dense tag data in mesh database */ # define mhdf_DENSE_TYPE 2 # /** \brief Was sparse tag data in mesh database */ # #define mhdf_SPARSE_TYPE 1 # /** \brief Was bit-field tag data in mesh database */ # #define mhdf_BIT_TYPE 0 # /** \brief Unused */ # #define mhdf_MESH_TYPE 3 # g.attrs['class'] = 2 # add elements elements = tstt.create_group('elements') elem_dt = h5py.special_dtype(enum=('i', { 'Edge': 1, 'Tri': 2, 'Quad': 3, 'Polygon': 4, 'Tet': 5, 'Pyramid': 6, 'Prism': 7, 'Knife': 8, 'Hex': 9, 'Polyhedron': 10 })) tstt['elemtypes'] = elem_dt tstt.create_dataset('history', data=[ __name__.encode('utf-8'), __about__.__version__.encode('utf-8'), str(datetime.now()).encode('utf-8') ]) # number of nodes to h5m name, element type meshio_to_h5m_type = { 'line': { 'name': 'Edge2', 'type': 1 }, 'triangle': { 'name': 'Tri3', 'type': 2 }, 'tetra': { 'name': 'Tet4', 'type': 5 } } for key, data in cells.items(): if key not in meshio_to_h5m_type: logging.warning('Unsupported H5M element type \'%s\'. Skipping.', key) continue this_type = meshio_to_h5m_type[key] elem_group = elements.create_group(this_type['name']) elem_group.attrs.create('element_type', this_type['type'], dtype=elem_dt) # h5m node indices are 1-based conn = elem_group.create_dataset('connectivity', data=(data + 1)) conn.attrs.create('start_id', global_id) global_id += len(data) # add cell data if cell_data: tags = elem_group.create_group('tags') for key, value in cell_data.items(): tags.create_dataset(key, data=value) # add empty set -- MOAB wants this sets = tstt.create_group('sets') sets.create_group('tags') # set max_id tstt.attrs.create('max_id', global_id, dtype='u8') return
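The element groups written above carry an enum-typed 'element_type' attribute, and the enum itself is committed at tstt/elemtypes. A small sketch (hypothetical file name) of reading the numeric code back and recovering its name with h5py.check_dtype:

import h5py

with h5py.File("mesh.h5m", "r") as f:                 # hypothetical file written by write()
    elem_group = f["tstt/elements/Tri3"]
    type_code = elem_group.attrs["element_type"]      # stored as the enum's base integer
    enum_dtype = f["tstt/elemtypes"].dtype            # the committed enum type from above
    mapping = h5py.check_dtype(enum=enum_dtype)       # e.g. {'Edge': 1, 'Tri': 2, ...}
    names = {v: k for k, v in mapping.items()}
    print(type_code, names.get(int(type_code)))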
# rd_cad = comm.bcast(rd_cad, root=0) # carma task arguments nwalkers = 200 nsteps = 1000 bands = ['r'] # will loop over for each band in list shape = (nwalkers, nsteps) dtype = np.dtype([('LnPosterior', np.float64, shape), ('Chain[0]', np.float64, shape), ('Chain[1]', np.float64, shape), ('rootChain[0]', np.complex128, shape), ('rootChain[1]', np.complex128, shape)]) # hdf5 reference special data type ref_dtype = h5py.special_dtype(ref=h5py.Reference) dt = h5py.special_dtype(vlen=str) def lsst_fit(lc, grp): """Take full mock LC and SDSS cadence to find best_fit params. Args: lc: Kali LC object, full mock LC. grp: HDF5 group storing the MCMC chains. """ best_param = [] # store best-fit params ref_ls = [] task = kali.carma.CARMATask(1, 0,
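The ref_dtype above is an HDF5 object-reference type, presumably used to point at the groups holding each band's MCMC chains. A minimal standalone sketch of storing and dereferencing such references; the file, group, and dataset names are hypothetical.

import h5py
import numpy as np

ref_dtype = h5py.special_dtype(ref=h5py.Reference)

with h5py.File("fits_demo.h5", "w") as f:             # hypothetical file name
    grp_r = f.create_group("chains/band_r")
    grp_r.create_dataset("LnPosterior", data=np.zeros((200, 1000)))
    refs = f.create_dataset("band_refs", (1, ), dtype=ref_dtype)
    refs[0] = grp_r.ref                                # store a reference to the group
    target = f[refs[0]]                                # indexing with a reference dereferences it
    print(target.name)                                 # -> /chains/band_r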
def step(self, action): ''' Convention says environment outputs np.arrays :param action: LongTensor(batch_size), or np.array(batch_sizelast discrete action chosen :return: ''' try: # in case action is a torch.Tensor action = action.cpu().to_numpy() except: pass self.actions.append(action[:, None]) next_state = action if len(self.actions) < self._max_episode_steps: done = self.codec.is_padding( action) # max index is padding, by convention else: done = np.ones_like(action) == 1 reward = np.zeros_like(action, dtype=np.float) # for those sequences just computed, calculate the reward for i in range(len(action)): if self.done_rewards[i] is None and done[i]: this_action_seq = np.concatenate(self.actions, axis=1)[i:(i + 1), :] this_char_seq = self.codec.actions_to_strings( this_action_seq) # codec expects a batch self.smiles[i] = this_char_seq[0] this_mol = Chem.MolFromSmiles(self.smiles[i]) if this_mol is None: print(self.smiles[i]) # rules = self.codec.grammar.GCFG.productions() # for a in this_action_seq[0]: # print(rules[a]) self.valid[i] = 0 else: self.valid[i] = 1 this_reward = self.reward_fun(this_char_seq)[0] self.done_rewards[i] = this_reward reward[i] = this_reward self.seq_len[i] = len(self.actions) #TODO: put the special string handling into the hdf5 wrapper import h5py dt = h5py.special_dtype( vlen=str) # PY3 hdf5 datatype for variable-length Unicode strings if len(self.actions) == self._max_episode_steps: # dump the whole batch to disk append_data = { 'smiles': np.array(self.smiles, dtype=dt), 'actions': np.concatenate(self.actions, axis=1), 'seq_len': self.seq_len } if self.save_dataset is not None: self.save_dataset.append(append_data) if False and not all(reward == reward): print('failure!') return next_state, reward, done, (self.smiles, self.valid)
def writeVlen(self, data, key='data'):
    self.makedirs()
    # open in append mode explicitly (the default File mode changed in h5py 3)
    with h5py.File(self.path, 'a') as f:
        dt = h5py.special_dtype(vlen=np.dtype(data[0].dtype))
        f.create_dataset(key, data=data, dtype=dt)
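Each element of a vlen dataset like the one written above is an independent 1-D array. A small round-trip sketch with a hypothetical file name, building the ragged input explicitly as an object array:

import h5py
import numpy as np

ragged = np.empty(2, dtype=object)
ragged[0] = np.array([1.0, 2.0])
ragged[1] = np.array([3.0, 4.0, 5.0])

with h5py.File("vlen_demo.h5", "w") as f:             # hypothetical file name
    dt = h5py.special_dtype(vlen=np.dtype(ragged[0].dtype))
    f.create_dataset("data", data=ragged, dtype=dt)

with h5py.File("vlen_demo.h5", "r") as f:
    first = f["data"][0]                               # -> array([1., 2.])
    lengths = [len(row) for row in f["data"][:]]       # rows keep their own lengths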
from h5py import special_dtype, Reference, RegionReference
from six import text_type, binary_type

from ...utils import docval, getargs, popargs, call_docval_func
from ...data_utils import AbstractDataChunkIterator, get_shape
from ...build import Builder, GroupBuilder, DatasetBuilder, LinkBuilder, BuildManager,\
    RegionBuilder, ReferenceBuilder, TypeMap
from ...spec import RefSpec, DtypeSpec, NamespaceCatalog, GroupSpec
from ...spec import NamespaceBuilder
from .h5_utils import H5ReferenceDataset, H5RegionDataset, H5TableDataset,\
    H5DataIO, H5SpecReader, H5SpecWriter
from ..io import FORMIO

ROOT_NAME = 'root'
SPEC_LOC_ATTR = '.specloc'
H5_TEXT = special_dtype(vlen=text_type)
H5_BINARY = special_dtype(vlen=binary_type)
H5_REF = special_dtype(ref=Reference)
H5_REGREF = special_dtype(ref=RegionReference)


class HDF5IO(FORMIO):

    @docval({
        'name': 'path',
        'type': str,
        'doc': 'the path to the HDF5 file'
    }, {
        'name': 'manager',
        'type': BuildManager,
        'doc': 'the BuildManager to use for I/O',
        'default': None
def main(args): print('Loading image info from "%s"' % args.images_json) with open(args.images_json, 'r') as f: images = json.load(f) image_id_to_image = {i['image_id']: i for i in images} with open(args.splits_json, 'r') as f: splits = json.load(f) # Filter images for being too small splits = remove_small_images(args, image_id_to_image, splits) obj_aliases = load_aliases(args.object_aliases) rel_aliases = load_aliases(args.relationship_aliases) print('Loading objects from "%s"' % args.objects_json) with open(args.objects_json, 'r') as f: objects = json.load(f) # Vocab for objects and relationships vocab = {} train_ids = splits[args.train_split] create_object_vocab(args, train_ids, objects, obj_aliases, vocab) print('Loading attributes from "%s"' % args.attributes_json) with open(args.attributes_json, 'r') as f: attributes = json.load(f) # Vocab for attributes create_attribute_vocab(args, train_ids, attributes, vocab) object_id_to_obj = filter_objects(args, objects, obj_aliases, vocab, splits) print('After filtering there are %d object instances' % len(object_id_to_obj)) print('Loading relationshps from "%s"' % args.relationships_json) with open(args.relationships_json, 'r') as f: relationships = json.load(f) create_rel_vocab(args, train_ids, relationships, object_id_to_obj, rel_aliases, vocab) print('Encoding objects and relationships ...') numpy_arrays = encode_graphs(args, splits, objects, relationships, vocab, object_id_to_obj, attributes) print('Writing HDF5 output files') for split_name, split_arrays in numpy_arrays.items(): image_ids = list(split_arrays['image_ids'].astype(int)) h5_path = os.path.join(args.output_h5_dir, '%s.h5' % split_name) print('Writing file "%s"' % h5_path) with h5py.File(h5_path, 'w') as h5_file: for name, ary in split_arrays.items(): print('Creating datset: ', name, ary.shape, ary.dtype) h5_file.create_dataset(name, data=ary) print('Writing image paths') image_paths = get_image_paths(image_id_to_image, image_ids) path_dtype = h5py.special_dtype(vlen=str) path_shape = (len(image_paths),) path_dset = h5_file.create_dataset('image_paths', path_shape, dtype=path_dtype) for i, p in enumerate(image_paths): path_dset[i] = p print() print('Writing vocab to "%s"' % args.output_vocab_json) with open(args.output_vocab_json, 'w') as f: json.dump(vocab, f)
def save_attr(group, col, scarf_col, md):
    d = md.fetch_all(scarf_col)
    h5[group].create_dataset(col, data=d.astype(h5py.special_dtype(vlen=str)))