def __setitem__(self, args, val):
    """ Write to the HDF5 dataset from a Numpy array.

    NumPy's broadcasting rules are honored, for "simple" indexing
    (slices and integers).  For advanced indexing, the shapes must
    match.
    """
    args = args if isinstance(args, tuple) else (args,)

    # Sort field indices from the slicing
    names = tuple(x for x in args if isinstance(x, str))
    args = tuple(x for x in args if not isinstance(x, str))
    if len(names) != 0:
        raise TypeError("Field name selections are not allowed for write.")

    # Generally we try to avoid converting the arrays on the Python
    # side.  However, for compound literals this is unavoidable.
    if self.dtype.kind == 'V' and \
            (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V'):
        val = numpy.asarray(val, dtype=self.dtype, order='C')
    else:
        val = numpy.asarray(val, order='C')

    # Check for array dtype compatibility and convert
    if self.dtype.subdtype is not None:
        shp = self.dtype.subdtype[1]
        if val.shape[-len(shp):] != shp:
            raise TypeError("Can't broadcast to array dimension %s" % (shp,))
        mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
        mshape = val.shape[0:len(val.shape) - len(shp)]
    else:
        mshape = val.shape
        mtype = None

    # Perform the dataspace selection
    selection = sel.select(self.shape, args, dsid=self.id)

    if selection.nselect == 0:
        return

    # Broadcast scalars if necessary.
    if mshape == () and selection.mshape != ():
        if self.dtype.subdtype is not None:
            raise NotImplementedError("Scalar broadcasting is not supported for array dtypes")
        val2 = numpy.empty(selection.mshape[-1], dtype=val.dtype)
        val2[...] = val
        val = val2
        mshape = val.shape

    # Perform the write, with broadcasting
    # Be careful to pad memory shape with ones to avoid HDF5 chunking
    # glitch, which kicks in for mismatched memory/file selections
    if len(mshape) < len(self.shape):
        mshape_pad = (1,) * (len(self.shape) - len(mshape)) + mshape
    else:
        mshape_pad = mshape
    mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,) * len(mshape_pad))
    for fspace in selection.broadcast(mshape):
        self.id.write(mspace, fspace, val, mtype)
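# A minimal usage sketch for the broadcasting write path above, assuming a
# plain h5py file; the file name "demo.h5" and dataset name "x" are
# illustrative only.
import numpy
import h5py

with h5py.File("demo.h5", "w") as f:
    dset = f.create_dataset("x", shape=(4, 3), dtype="f4")
    dset[...] = 1.0                  # scalar broadcast across the selection
    dset[1, :] = numpy.arange(3)     # simple indexing follows NumPy rules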
def main():
    a = DetPulseCoord()
    fileid = h5f.create(b"test.h5")

    x = ones((100, 3), dtype=int32)
    y = ones((100, 7), dtype=float32)
    z = ones((100, 2), dtype=float32)
    c = [(x[i], y[i], z[i]) for i in range(100)]
    data = {a.names[0]: x, a.names[1]: y}

    dspaceid = h5s.create_simple((1,), (h5s.UNLIMITED,))
    # dset = h5d.create(fileid, a.name, a.type, dspaceid)
    # dset.write()

    file = File("test.h5")
    numpytype = dtype([("coord", int32, (3,)),
                       ("pulse", float32, (7,)),
                       ("EZ", float32, (2,))])
    data = array(c, dtype=numpytype)

    tid = h5t.C_S1.copy()
    tid.set_size(6)
    H5T6 = Datatype(tid)
    tid.set_size(4)
    H5T_C_S1_4 = Datatype(tid)

    file.create_dataset("DetPulseCoord", data=data)
    file.attrs.create("CLASS", "TABLE", dtype=H5T6)
    file.attrs.create("FIELD_0_NAME", a.names[0])
    file.attrs.create("FIELD_1_NAME", a.names[1])
    file.attrs.create("TITLE", "Detpulse coord pair data")
    file.attrs.create("VERSION", "3.0", dtype=H5T_C_S1_4)
    file.attrs.create("abstime", 1.45e9, dtype=float64, shape=(1,))
    file.attrs.create("nevents", 122421, dtype=float64, shape=(1,))
    file.attrs.create("runtime", 125000, dtype=float64, shape=(1,))
    file.flush()
def test_plugins(self):
    shape = (32 * 1024,)
    chunks = (4 * 1024,)
    dtype = np.int64
    data = np.arange(shape[0])
    fname = "tmp_test_filters.h5"
    f = h5py.File(fname)
    tid = h5t.py_create(dtype, logical=1)
    sid = h5s.create_simple(shape, shape)
    # Different APIs for different h5py versions.
    try:
        dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None,
                                     None, None, None, None)
    except TypeError:
        dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None,
                                     None, None, None)
    dcpl.set_filter(32008, h5z.FLAG_MANDATORY)
    dcpl.set_filter(32000, h5z.FLAG_MANDATORY)
    dset_id = h5d.create(f.id, b"range", tid, sid, dcpl=dcpl)
    dset_id.write(h5s.ALL, h5s.ALL, data)
    f.close()

    # Make sure the filters are working outside of h5py by calling h5dump
    h5dump = Popen(['h5dump', fname], stdout=PIPE, stderr=STDOUT)
    stdout, nothing = h5dump.communicate()
    err = h5dump.returncode
    self.assertEqual(err, 0)

    f = h5py.File(fname, 'r')
    d = f['range'][:]
    self.assertTrue(np.all(d == data))
    f.close()
def test_plugins(self):
    if not H51811P:
        return
    shape = (32 * 1024,)
    chunks = (4 * 1024,)
    dtype = np.int64
    data = np.arange(shape[0])
    fname = "tmp_test_filters.h5"
    f = h5py.File(fname)
    tid = h5t.py_create(dtype, logical=1)
    sid = h5s.create_simple(shape, shape)
    # Different APIs for different h5py versions.
    try:
        dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None,
                                     None, None, None, None)
    except TypeError:
        dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None,
                                     None, None, None)
    dcpl.set_filter(32008, h5z.FLAG_MANDATORY)
    dcpl.set_filter(32000, h5z.FLAG_MANDATORY)
    dset_id = h5d.create(f.id, b"range", tid, sid, dcpl=dcpl)
    dset_id.write(h5s.ALL, h5s.ALL, data)
    f.close()

    # Make sure the filters are working outside of h5py by calling h5dump
    h5dump = Popen(['h5dump', fname], stdout=PIPE, stderr=STDOUT)
    stdout, nothing = h5dump.communicate()
    err = h5dump.returncode
    self.assertEqual(err, 0)

    f = h5py.File(fname, 'r')
    d = f['range'][:]
    self.assertTrue(np.all(d == data))
    f.close()
def readDataSet(gid, name, dt=np.int32):
    did = h5d.open(gid, name)
    space_id = did.get_space()
    dims = space_id.get_simple_extent_dims()
    memsp_id = h5s.create_simple(dims)
    data = np.zeros(dims, dtype=dt)
    did.read(memsp_id, space_id, data)
    return data
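# Hypothetical use of readDataSet: open a file at the low level and read a
# dataset; the file name b"data.h5" and dataset name b"values" are assumptions.
import numpy as np
from h5py import h5f

fid = h5f.open(b"data.h5")
arr = readDataSet(fid, b"values", dt=np.float64)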
def __init__(self, shape, spaceid=None):
    """ Create a selection.  Shape may be None if spaceid is given. """
    if spaceid is not None:
        self._id = spaceid
        self._shape = spaceid.shape
    else:
        shape = tuple(shape)
        self._shape = shape
        self._id = h5s.create_simple(shape, (h5s.UNLIMITED,) * len(shape))
        self._id.select_all()
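# A small sketch of the h5s.create_simple call used above: the second argument
# is the maximum shape, and h5s.UNLIMITED marks a dimension as growable.
from h5py import h5s

sid = h5s.create_simple((10, 3), (h5s.UNLIMITED, 3))  # first axis unlimited
sid.select_all()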
def set_preference(Sfn, preference=None, factor=1.0, mpi=None,
                   verbose=False, debug=False, *args, **kwargs):
    comm, NPROCS, rank = mpi

    # Init storage for matrices
    # Open matrix file
    SSf = h5py.File(Sfn, 'r+', driver='sec2')
    SSf.atomic = True

    # Open table with data for clusterization
    SS = SSf['cluster']
    SSs = SS.id.get_space()

    ms = h5s.create_simple((1, 1))
    tS = np.zeros((1,), dtype=np.float32)
    ft = np.float32

    N, N1 = SS.shape
    if N != N1:
        raise ValueError("S must be a square array (shape=%s)" % repr((N, N1)))

    if not preference:
        try:
            preference = SS.attrs['median']
        except Exception:
            raise ValueError('Unable to get preference from cluster matrix')
    preference = ft(preference * factor)

    # Copy input data and place preference on diagonal
    random_state = np.random.RandomState(0)
    x = np.finfo(ft).eps
    y = np.finfo(ft).tiny * 100
    for i in range(N):
        tS[0] = preference + (preference * x + y) * random_state.randn()
        SSs.select_hyperslab((i, i), (1, 1))
        SS.id.write(ms, SSs, tS)

    SS.attrs['preference'] = preference

    if verbose:
        print('Preference: %f' % preference)

    SSf.close()
def create(self, name, data, shape=None, dtype=None):
    """ Create a new attribute, overwriting any existing attribute.

    name
        Name of the new attribute (required)
    data
        An array to initialize the attribute (required)
    shape
        Shape of the attribute.  Overrides data.shape if both are
        given, in which case the total number of points must be
        unchanged.
    dtype
        Data type of the attribute.  Overrides data.dtype if both
        are given.
    """
    with phil:
        if data is not None:
            data = numpy.asarray(data, order="C", dtype=dtype)
            if shape is None:
                shape = data.shape
            elif numpy.product(shape) != numpy.product(data.shape):
                raise ValueError("Shape of new attribute conflicts with shape of data")

            if dtype is None:
                dtype = data.dtype

        if isinstance(dtype, h5py.Datatype):
            htype = dtype.id
            dtype = htype.dtype
        else:
            if dtype is None:
                dtype = numpy.dtype("f")
            htype = h5t.py_create(dtype, logical=True)

        if shape is None:
            raise ValueError('At least one of "shape" or "data" must be given')

        data = data.reshape(shape)

        space = h5s.create_simple(shape)

        if name in self:
            h5a.delete(self._id, self._e(name))

        attr = h5a.create(self._id, self._e(name), htype, space)
        if data is not None:
            try:
                attr.write(data)
            except:
                attr._close()
                h5a.delete(self._id, self._e(name))
                raise
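# A short sketch of the high-level entry point for the create() method above;
# the file and attribute names are placeholders.
import numpy
import h5py

with h5py.File("attrs_demo.h5", "w") as f:
    f.attrs.create("version", data=3, dtype=numpy.int32)
    f.attrs.create("scale", data=[1.0, 2.0], shape=(2,))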
def create(self, name, data, shape=None, dtype=None):
    """ Create a new attribute, overwriting any existing attribute.

    name
        Name of the new attribute (required)
    data
        An array to initialize the attribute (required)
    shape
        Shape of the attribute.  Overrides data.shape if both are
        given, in which case the total number of points must be
        unchanged.
    dtype
        Data type of the attribute.  Overrides data.dtype if both
        are given.
    """
    # TODO: REMOVE WHEN UNICODE VLENS IMPLEMENTED
    # Hack to support Unicode values (scalars only)
    # if isinstance(data, unicode):
    #     unicode_hack = True
    #     data = data.encode('utf8')
    # else:
    #     unicode_hack = False

    if data is not None:
        data = numpy.asarray(data, order='C', dtype=dtype)
        if shape is None:
            shape = data.shape
        elif numpy.product(shape) != numpy.product(data.shape):
            raise ValueError("Shape of new attribute conflicts with shape of data")

        if dtype is None:
            dtype = data.dtype

    if dtype is None:
        dtype = numpy.dtype('f')

    if shape is None:
        raise ValueError('At least one of "shape" or "data" must be given')

    data = data.reshape(shape)

    space = h5s.create_simple(shape)
    htype = h5t.py_create(dtype, logical=True)

    # TODO: REMOVE WHEN UNICODE VLENS IMPLEMENTED
    # if unicode_hack:
    #     htype.set_cset(h5t.CSET_UTF8)

    if name in self:
        h5a.delete(self._id, self._e(name))

    attr = h5a.create(self._id, self._e(name), htype, space)
    if data is not None:
        attr.write(data)
def _write_dset_low(dset, data, arr_slice, collective=False):
    memory_space = h5s.create_simple(data.shape)
    file_space = dset.id.get_space()
    s = (arr_slice[0].start, arr_slice[1].start, arr_slice[2].start)
    e = (arr_slice[0].stop, arr_slice[1].stop, arr_slice[2].stop)
    count = tuple(ee - ss for ss, ee in zip(s, e))
    file_space.select_hyperslab(s, count)
    if collective:
        dxpl = h5p.create(h5p.DATASET_XFER)
        dxpl.set_dxpl_mpio(h5fd.MPIO_COLLECTIVE)
    else:
        dxpl = None
    dset.id.write(memory_space, file_space, np.ascontiguousarray(data), dxpl=dxpl)
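# Hypothetical call to _write_dset_low: write a 2x2x2 block into the corner of
# an existing 3-D dataset ("vol.h5" and "vol" are illustrative names).
import numpy as np
import h5py

with h5py.File("vol.h5", "w") as f:
    dset = f.create_dataset("vol", shape=(8, 8, 8), dtype="f8")
    block = np.ones((2, 2, 2))
    _write_dset_low(dset, block, (slice(0, 2), slice(0, 2), slice(0, 2)))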
def calc_rmsd_matrix(Sfn, tier=1, mpi=None, verbose=False, noalign=False,
                     *args, **kwargs):
    if noalign:
        cl = 'NOSUP_SERIAL_CALCULATOR'
    else:
        cl = 'KABSCH_SERIAL_CALCULATOR'

    def calc_diag_chunk(ic, tS, cl):
        calculator = pyRMSD.RMSDCalculator.RMSDCalculator(cl, ic)
        rmsd = calculator.pairwiseRMSDMatrix()
        rmsd_matrix = condensedMatrix.CondensedMatrix(rmsd)
        ln = len(tS)
        for i in range(ln):
            for j in range(i):
                tS[i, j] = rmsd_matrix[i, j]

    def calc_chunk(ic, jc, tS, cl):
        ln, n, d = ic.shape
        ttS = np.zeros((ln + 1, n, d))
        ttS[1:] = jc
        for i in range(ln):
            ttS[0] = ic[i]
            calculator = pyRMSD.RMSDCalculator.RMSDCalculator(cl, ttS)
            tS[i] = calculator.oneVsFollowing(0)

    def partition(N, NPROCS, rank):
        # Partitioning
        l = N // NPROCS
        lr = N % NPROCS

        if lr > 0 and rank == 0:
            print('Truncating matrix to %dx%d to fit %d procs'
                  % (l * NPROCS, l * NPROCS, NPROCS))

        lN = (NPROCS + 1) * NPROCS // 2

        m = lN // NPROCS
        mr = lN % NPROCS

        if mr > 0:
            m = m + 1 if rank % 2 == 0 else m

        return (l, m)

    comm, NPROCS, rank = mpi

    # Reread structures by every process
    if NPROCS == 1:
        Sf = h5py.File(Sfn, 'r+', driver='sec2')
    else:
        Sf = h5py.File(Sfn, 'r+', driver='mpio', comm=comm)
    Gn = 'tier%d' % tier
    G = Sf.require_group(Gn)
    S = G['struct']

    # Count number of structures
    N = S.len()

    l, m = partition(N, NPROCS, rank)

    # HDF5 file: table for RMSD
    RM = G.require_dataset('rmsd', (N, N), dtype=np.float32, chunks=(l, l))
    RM.attrs['chunk'] = l
    RMs = RM.id.get_space()

    # Init calculations
    tS = np.zeros((l, l), dtype=np.float32)
    ms = h5s.create_simple((l, l))

    i, j = rank, rank
    ic = S[i * l:(i + 1) * l]
    jc = ic

    for c in range(0, m):
        if rank == 0:
            tit = time.time()

        if i == j:
            calc_diag_chunk(ic, tS, cl)
        else:
            calc_chunk(ic, jc, tS, cl)

        RMs.select_hyperslab((i * l, j * l), (l, l))
        RM.id.write(ms, RMs, tS)

        if rank == 0:
            teit = time.time()
            if verbose:
                print("Step %d of %d T %s" % (c, m, teit - tit))

        # Dark magic of task assignment
        if 0 < (rank - c):
            j = j - 1
            jc = S[j * l:(j + 1) * l]
        elif rank - c == 0:
            i = NPROCS - rank - 1
            ic = S[i * l:(i + 1) * l]
        else:
            j = j + 1
            jc = S[j * l:(j + 1) * l]

    # Wait for all processes
    comm.Barrier()

    # Cleanup: close matrix file
    Sf.close()
def prepare_cluster_matrix(Sfn, tier=1, mpi=None, verbose=False,
                           *args, **kwargs):

    def calc_chunk(l, tRM, tCM):
        ttCM = tRM + tCM * random_state.randn(l, l)
        return ttCM

    def calc_chunk_diag(l, tRM, tCM):
        ttCM = tCM + tCM.transpose()
        ttRM = tRM + tRM.transpose()
        ttCM = calc_chunk(l, ttRM, ttCM)
        return ttCM

    comm, NPROCS, rank = mpi

    # Init RMSD matrix
    # Open matrix file in parallel mode
    if NPROCS == 1:
        Sf = h5py.File(Sfn, 'r+', driver='sec2')
    else:
        Sf = h5py.File(Sfn, 'r+', driver='mpio', comm=comm)
    Gn = 'tier%d' % tier
    G = Sf.require_group(Gn)

    # Open table with data for clusterization
    RM = G['rmsd']
    RMs = RM.id.get_space()

    N = RM.len()
    l = N // NPROCS

    if rank == 0:
        N, N1 = RM.shape
        if N != N1:
            raise ValueError("S must be a square array (shape=%s)" % repr(RM.shape))

        if RM.attrs['chunk'] % l > 0:
            raise ValueError("Wrong chunk size in RMSD matrix")

    CM = G.require_dataset('cluster', (N, N), dtype=np.float32, chunks=(l, l))
    CM.attrs['chunk'] = l
    CMs = CM.id.get_space()

    random_state = np.random.RandomState(0)
    x = np.finfo(np.float32).eps
    y = np.finfo(np.float32).tiny * 100

    # Partitioning
    lN = (NPROCS + 1) * NPROCS // 2
    m = lN // NPROCS
    mr = lN % NPROCS

    if mr > 0:
        m = m + 1 if rank % 2 == 0 else m

    # Init calculations
    tRM = np.zeros((l, l), dtype=np.float32)
    tCM = np.zeros((l, l), dtype=np.float32)
    ttCM = np.zeros((l, l), dtype=np.float32)
    ms = h5s.create_simple((l, l))

    i, j = rank, rank

    for c in range(m):
        if rank == 0:
            tit = time.time()

        RMs.select_hyperslab((i * l, j * l), (l, l))
        RM.id.read(ms, RMs, tRM)

        # tRM = -1 * tRM ** 2
        tRM **= 2
        tRM *= -1
        tCM = tRM * x + y

        if i == j:
            ttCM = calc_chunk_diag(l, tRM[:], tCM[:])
            CMs.select_hyperslab((i * l, j * l), (l, l))
            CM.id.write(ms, CMs, ttCM)
        else:
            ttCM = calc_chunk(l, tRM[:], tCM[:])
            CMs.select_hyperslab((i * l, j * l), (l, l))
            CM.id.write(ms, CMs, ttCM)

            ttCM = calc_chunk(l, tRM.transpose(), tCM.transpose())
            CMs.select_hyperslab((j * l, i * l), (l, l))
            CM.id.write(ms, CMs, ttCM)

        if rank == 0:
            teit = time.time()
            if verbose:
                print("Step %d of %d T %s" % (c, m, teit - tit))

        if (rank - c) > 0:
            j = j - 1
        elif (rank - c) == 0:
            i = NPROCS - rank - 1
        else:
            j = j + 1

    # Wait for all processes
    comm.Barrier()

    Sf.close()
    print('Dataset fits memory')
    comm.Abort()

P = comm.bcast(P)
N = P.N
l = P.l
ll = P.ll
tb, te = task(rank, l)
disk = P.disk
damping = P.damping

ms_l = h5s.create_simple((N,))
tSl = np.ndarray((N,), dtype=np.float)
ms = h5s.create_simple((ll, N))
tS = np.ndarray((ll, N), dtype=np.float)
tdS = np.ndarray((1,), dtype=np.float)

TMLf = h5py.File(P.TMfn + '_' + str(rank) + '.hdf5', 'w')

S = TMLf.create_dataset('S', (l, N), dtype=np.float)
Ss = S.id.get_space()

# Copy input data and place preference on diagonal
N, N1 = CM.shape
if N != N1:
    raise ValueError("S must be a square array (shape=%s)" % repr(CM.shape))

if l <= 0:
    raise ValueError("Wrong chunk size in RMSD matrix")

# Init calculations
# med = livestats.LiveStats()
med = lvc_double.Quantile(0.5)
madd = np.vectorize(med.add)

tCM = np.zeros((N,), dtype=np.float)
ms = h5s.create_simple((N,))

c = 0
for i in range(N):
    # CMs.select_hyperslab((i * l, j * l), (l, l))
    CMs.select_hyperslab((i, 0), (1, N))
    CM.id.read(ms, CMs, tCM)
    med.add(tCM)
    # madd(tCM)
    # for x in np.nditer(tCM):
    #     med.add(x)
    c += 1
# level, median = med.quantiles()[0]
def _calc_ci_block(block_label, assignments_filename, kinetics_filename,
                   istate, jstate, start_iter, stop_iter,
                   mcbs_alpha, mcbs_acalpha, mcbs_nsets, extrapolate):
    log.debug('istate={} jstate={} start_iter={} stop_iter={}'
              .format(istate, jstate, start_iter, stop_iter))
    assignments_file = h5py.File(assignments_filename, 'r')
    kinetics_file = h5py.File(kinetics_filename, 'r')

    nstates, nbins = assignments_file.attrs['nstates'], assignments_file.attrs['nbins']
    niters = stop_iter - start_iter

    # Fluxes and populations are averaged as they are read, as these are
    # generally very large datasets
    avg_fluxes = numpy.zeros((nstates, nstates, nbins, nbins), weight_dtype)
    avg_pops = numpy.zeros((nstates, nbins), weight_dtype)

    # Per-iteration macrostate-macrostate fluxes, for correlation calculation
    macro_fluxes = numpy.empty((niters, nstates, nstates), weight_dtype)

    # Source datasets
    pops_ds = assignments_file['labeled_populations']
    fluxes_ds = kinetics_file['labeled_bin_fluxes']
    pops_iter_start = pops_ds.attrs.get('iter_start', 1)
    fluxes_iter_start = fluxes_ds.attrs.get('iter_start', 1)

    # prepend 1 so that rank of dest == rank of src
    labeled_fluxes = numpy.empty((1, nstates, nstates, nbins, nbins), weight_dtype)
    labeled_pops = numpy.empty((1, nstates, nbins), weight_dtype)

    lflux_memsel = h5s.create_simple(labeled_fluxes.shape,
                                     (h5s.UNLIMITED,) * labeled_fluxes.ndim)
    lpop_memsel = h5s.create_simple(labeled_pops.shape,
                                    (h5s.UNLIMITED,) * labeled_pops.ndim)

    fluxes_dsid = fluxes_ds.id
    pops_dsid = pops_ds.id

    lflux_filesel = fluxes_dsid.get_space()
    lpop_filesel = pops_dsid.get_space()

    # Overall average
    for iiter, n_iter in enumerate(range(start_iter, stop_iter)):
        lflux_filesel.select_hyperslab((n_iter - fluxes_iter_start, 0, 0, 0, 0),
                                       (1, nstates, nstates, nbins, nbins),
                                       op=h5s.SELECT_SET)
        lpop_filesel.select_hyperslab((n_iter - pops_iter_start, 0, 0),
                                      (1, nstates, nbins),
                                      op=h5s.SELECT_SET)
        fluxes_dsid.read(lflux_memsel, lflux_filesel, labeled_fluxes)
        pops_dsid.read(lpop_memsel, lpop_filesel, labeled_pops)
        avg_fluxes += labeled_fluxes[0]
        avg_pops += labeled_pops[0]
        macro_fluxes[iiter] = labeled_fluxes[0].sum(axis=3).sum(axis=2)

    avg_fluxes /= niters
    avg_pops /= niters
    avg_rates = labeled_flux_to_rate(avg_fluxes, avg_pops)
    ss, macro_rates = get_macrostate_rates(avg_rates, avg_pops, extrapolate)
    overall_avg_rates = macro_rates.copy()
    ctime = mcbs_correltime(macro_fluxes[istate, jstate], mcbs_acalpha, mcbs_nsets)

    # bootstrap
    lbi = int(math.floor(mcbs_nsets * mcbs_alpha / 2.0))
    ubi = int(math.ceil(mcbs_nsets * (1 - mcbs_alpha / 2.0)))
    stride = ctime + 1

    synth_rates = numpy.empty((mcbs_nsets,), weight_dtype)
    starts = numpy.arange(start_iter, stop_iter, stride, dtype=numpy.uintc)
    stops = numpy.arange(start_iter + stride, stop_iter + stride, stride,
                         dtype=numpy.uintc)
    nblocks = len(starts)
    if stops[-1] > stop_iter:
        stops[-1] = stop_iter

    for iset in range(mcbs_nsets):
        avg_fluxes.fill(0)
        avg_pops.fill(0)
        iters_averaged = 0
        log.debug('iset={} istate={} jstate={}'.format(iset, istate, jstate))

        for _block in range(nblocks):
            iblock = random.randint(0, nblocks - 1)
            for n_iter in range(starts[iblock], stops[iblock]):
                iters_averaged += 1
                lflux_filesel.select_hyperslab((n_iter - fluxes_iter_start, 0, 0, 0, 0),
                                               (1, nstates, nstates, nbins, nbins),
                                               op=h5s.SELECT_SET)
                lpop_filesel.select_hyperslab((n_iter - pops_iter_start, 0, 0),
                                              (1, nstates, nbins),
                                              op=h5s.SELECT_SET)
                fluxes_dsid.read(lflux_memsel, lflux_filesel, labeled_fluxes)
                pops_dsid.read(lpop_memsel, lpop_filesel, labeled_pops)
                avg_fluxes += labeled_fluxes[0]
                avg_pops += labeled_pops[0]

        avg_fluxes /= iters_averaged
        avg_pops /= iters_averaged
        avg_rates = labeled_flux_to_rate(avg_fluxes, avg_pops)
        ss, macro_rates = get_macrostate_rates(avg_rates, avg_pops, extrapolate)
        synth_rates[iset] = macro_rates[istate, jstate]

    synth_rates.sort()
    return (block_label, istate, jstate,
            (start_iter, stop_iter, overall_avg_rates[istate, jstate],
             synth_rates[lbi], synth_rates[ubi], ctime))
t = parse(pdb_list[0])
na = t.shape[0]
na = comm.bcast(na)

# Init storage for matrices
Sfn = 'aff_struct.hdf5'

# HDF5 file
Sf = h5py.File(Sfn, 'w', driver='mpio', comm=comm)
Sf.atomic = True

# Table for RMSD
S = Sf.create_dataset('struct', (N, na, nc), dtype=np.float, chunks=(1, na, nc))
Ss = S.id.get_space()

tS = np.ndarray((l, na, nc), dtype=np.float32)
ms = h5s.create_simple((l, na, nc))

for i in range(tb, te):
    try:
        print('Parsing %s' % pdb_list[i])
        tS[i - tb] = parse(pdb_list[i])
        print('Parsed %s' % pdb_list[i])
    except Exception:
        raise ValueError('Broken structure %s' % pdb_list[i])

Ss.select_hyperslab((tb, 0, 0), (l, na, nc))
S.id.write(ms, Ss, tS)

# Wait for all processes
comm.Barrier()
Sf.close()
def create_attribute(_id, _name, _dims, _value):
    """ Writes a HDF5 string attribute, ASCII, NULLTERM

    _id should be something like dset.id
    _dims should be a list.  For a scalar, use an empty list []
    """
    # Make sure we don't have a unicode name
    _name = str_to_h5(_name)

    # This routine is for string attributes
    _dtype = h5t.FORTRAN_S1

    # Create a scalar space (if dims len=0); otherwise a simple space
    if len(_dims) == 0:
        _sid = h5s.create(h5s.SCALAR)
    elif len(_dims) == 1 and _dims[0] == 0:
        _sid = h5s.create(h5s.SCALAR)
    else:
        _sid = h5s.create_simple(tuple(_dims))

    # Create the memory & file datatypes.  Adjust if datatype is string.
    _mdtype = _dtype.copy()
    _fdtype = _dtype.copy()
    _classtype = _dtype.get_class()
    if _classtype == h5t.STRING:
        if isinstance(_value, list):
            _strlen = 0
            for _part in _value:
                _strlen = max(_strlen, len(_part))
        else:
            _strlen = len(_value)
        if _strlen < 1:
            return None
        _mdtype.set_size(_strlen)
        _mdtype.set_strpad(h5t.STR_SPACEPAD)
        _fdtype.set_size(_strlen + 1)
        _fdtype.set_strpad(h5t.STR_NULLTERM)

    # Either add or replace the attribute
    if h5a.exists(_id, _name):
        h5a.delete(_id, name=_name)
    _aid = h5a.create(_id, _name, _fdtype, _sid)

    if _classtype == h5t.STRING:
        if isinstance(_value, list):
            _value = np.array(_value, dtype=np.string_)
        else:
            _value = np.array(str_to_h5(_value))
    else:
        _pytype = _fdtype.dtype
        _value = np.array(_value, dtype=_pytype)

    _aid.write(_value)
    return _aid
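# Hypothetical usage of create_attribute on a dataset id; the file, dataset
# and attribute names are placeholders.
import h5py

with h5py.File("attr_demo.h5", "w") as f:
    dset = f.create_dataset("d", data=[1, 2, 3])
    create_attribute(dset.id, "units", [], "seconds")  # scalar string attribute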
def aff_cluster(Sfn, conv_iter=15, max_iter=2000, damping=0.95, mpi=None,
                verbose=False, debug=False, *args, **kwargs):
    comm, NPROCS, rank = mpi
    NPROCS_LOCAL = int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])

    # Open matrix file in parallel mode
    SSf = h5py.File(Sfn, 'r+', driver='mpio', comm=comm)
    SSf.atomic = True

    # Open table with data for clusterization
    SS = SSf['cluster']
    SSs = SS.id.get_space()

    params = {
        'N': 0,
        'l': 0,
        'll': 0,
        'TMfn': '',
        'disk': False,
        'preference': 0.0}
    P = Bunch(params)

    ft = np.float32

    if rank == 0:
        N, N1 = SS.shape
        if N != N1:
            raise ValueError("S must be a square array (shape=%s)" % repr((N, N1)))
        else:
            P.N = N

        try:
            preference = SS.attrs['preference']
        except Exception:
            raise ValueError('Unable to get preference from cluster matrix')

        if max_iter < 0:
            raise ValueError('max_iter must be > 0')

        if not 0 < conv_iter < max_iter:
            raise ValueError('conv_iter must lie in interval between 0 and max_iter')

        if damping < 0.5 or damping >= 1:
            raise ValueError('damping must lie in interval between 0.5 and 1')

        print('#' * 10, 'Main params', '#' * 10)
        print('preference: %.3f' % preference)
        print('damping: %.3f' % damping)
        print('conv_iter: %d' % conv_iter)
        print('max_iter: %d' % max_iter)
        print('#' * 31)

        P.TMbfn = str(uuid.uuid1())
        P.TMfn = P.TMbfn + '.hdf5'

        # Magic 4 to fit MPI.Gather
        r = N % (NPROCS * 4)
        N -= r
        l = N // NPROCS
        if r > 0:
            print('Truncating matrix to %sx%s to fit on %d procs' % (N, N, NPROCS))
        P.N = N

        # Fit to memory
        MEM = psutil.virtual_memory().available / NPROCS_LOCAL
        # MEM = 500 * 10 ** 6
        ts = np.dtype(ft).itemsize * N  # Python gives bits
        ts *= 8 * 1.1
        # Allocate memory for e, tE, and ...
        # MEM -= ts
        tl = int(MEM // ts)  # Allocate memory for tS, tA, tR....

        def adjust_cache(tl, l):
            while float(l) % float(tl) > 0:
                tl -= 1
            return tl

        if tl < l:
            P.disk = True
            try:
                cache = 0
                # cache = int(sys.argv[1])
                assert cache < l
            except Exception:
                cache = tl
                # print('Wrong cache settings, set cache to %d' % tl)
            tl = adjust_cache(tl, l)
            P.l = l
            P.ll = tl
        else:
            P.l = l
            P.ll = l

        if verbose:
            print("Available memory per process: %.2fG" % (MEM / 10.0 ** 9))
            print("Memory per row: %.2fM" % (ts / 10.0 ** 6))
            print("Estimated memory per process: %.2fG" % (ts * P.ll / 10.0 ** 9))
            print('Cache size is %d of %d' % (P.ll, P.l))

    P = comm.bcast(P)

    N = P.N
    l = P.l
    ll = P.ll

    ms = h5s.create_simple((ll, N))
    ms_l = h5s.create_simple((N,))

    tb, te = task(N, NPROCS, rank)

    tS = np.ndarray((ll, N), dtype=ft)
    tSl = np.ndarray((N,), dtype=ft)

    disk = P.disk
    if disk is True:
        TMLfd = tempfile.mkdtemp()
        TMLfn = osp(TMLfd, P.TMbfn + '_' + str(rank) + '.hdf5')
        TMLf = h5py.File(TMLfn, 'w')
        TMLf.atomic = True

        S = TMLf.create_dataset('S', (l, N), dtype=ft)
        Ss = S.id.get_space()

    # Copy input data and place preference on diagonal
    z = -np.finfo(ft).max
    for i in range(tb, te, ll):
        SSs.select_hyperslab((i, 0), (ll, N))
        SS.id.read(ms, SSs, tS)
        if disk is True:
            Ss.select_hyperslab((i - tb, 0), (ll, N))
            S.id.write(ms, Ss, tS)

    if disk is True:
        R = TMLf.create_dataset('R', (l, N), dtype=ft)
        Rs = R.id.get_space()
    tRold = np.zeros((ll, N), dtype=ft)
    tR = np.zeros((ll, N), dtype=ft)
    tdR = np.zeros((l,), dtype=ft)

    # Shared storage
    TMf = h5py.File(P.TMfn, 'w', driver='mpio', comm=comm)
    TMf.atomic = True

    Rp = TMf.create_dataset('Rp', (N, N), dtype=ft)
    Rps = Rp.id.get_space()
    tRp = np.ndarray((ll, N), dtype=ft)
    tRpa = np.ndarray((N, ll), dtype=ft)

    A = TMf.create_dataset('A', (N, N), dtype=ft)
    As = A.id.get_space()
    tAS = np.ndarray((ll, N), dtype=ft)
    tAold = np.ndarray((N, ll), dtype=ft)
    tA = np.ndarray((N, ll), dtype=ft)
    tdA = np.ndarray((l,), dtype=ft)

    e = np.ndarray((N, conv_iter), dtype=np.int8)
    tE = np.ndarray((N,), dtype=np.int8)
    ttE = np.ndarray((l,), dtype=np.int8)

    converged = False
    cK = 0
    K = 0
    ind = np.arange(ll)

    for it in range(max_iter):
        if rank == 0:
            if verbose is True:
                print('=' * 10 + 'It %d' % it + '=' * 10)
                tit = time.time()

        # Compute responsibilities
        for i in range(tb, te, ll):
            if disk is True:
                il = i - tb
                Ss.select_hyperslab((il, 0), (ll, N))
                S.id.read(ms, Ss, tS)
                # tS = S[i, :]
                Rs.select_hyperslab((il, 0), (ll, N))
                R.id.read(ms, Rs, tRold)
            else:
                tRold = tR.copy()

            As.select_hyperslab((i, 0), (ll, N))
            A.id.read(ms, As, tAS)
            tAS += tS

            tI = bn.nanargmax(tAS, axis=1)
            tY = tAS[ind, tI]
            tAS[ind, tI[ind]] = z
            tY2 = bn.nanmax(tAS, axis=1)

            tR = tS - tY[:, np.newaxis]
            tR[ind, tI[ind]] = tS[ind, tI[ind]] - tY2[ind]
            tR = (1 - damping) * tR + damping * tRold

            tRp = np.maximum(tR, 0)

            for il in range(ll):
                tRp[il, i + il] = tR[il, i + il]
                tdR[i - tb + il] = tR[il, i + il]

            if disk is True:
                R.id.write(ms, Rs, tR)  # R[i, :] = tR

            Rps.select_hyperslab((i, 0), (ll, N))
            Rp.id.write(ms, Rps, tRp)  # Rp[i, :] = tRp

        if rank == 0:
            if verbose is True:
                teit1 = time.time()
                print('R T %s' % (teit1 - tit))

        comm.Barrier()

        # Compute availabilities
        for j in range(tb, te, ll):
            As.select_hyperslab((0, j), (N, ll))
            if disk is True:
                A.id.read(ms, As, tAold)
            else:
                tAold = tA.copy()

            Rps.select_hyperslab((0, j), (N, ll))
            Rp.id.read(ms, Rps, tRpa)  # tRp = Rp[:, j]

            tA = bn.nansum(tRpa, axis=0)[np.newaxis, :] - tRpa
            for jl in range(ll):
                tdA[j - tb + jl] = tA[j + jl, jl]
            tA = np.minimum(tA, 0)
            for jl in range(ll):
                tA[j + jl, jl] = tdA[j - tb + jl]

            tA *= (1 - damping)
            tA += damping * tAold

            for jl in range(ll):
                tdA[j - tb + jl] = tA[j + jl, jl]

            A.id.write(ms, As, tA)

        if rank == 0:
            if verbose is True:
                teit2 = time.time()
                print('A T %s' % (teit2 - teit1))

        ttE = np.array(((tdA + tdR) > 0), dtype=np.int8)

        if NPROCS > 1:
            comm.Gather([ttE, MPI.INT], [tE, MPI.INT])
            comm.Bcast([tE, MPI.INT])
        else:
            tE = ttE
        e[:, it % conv_iter] = tE
        pK = K
        K = bn.nansum(tE)

        if rank == 0:
            if verbose is True:
                teit = time.time()
                cc = ''
                if K == pK:
                    if cK == 0:
                        cK += 1
                    elif cK > 1:
                        cc = ' Conv %d of %d' % (cK, conv_iter)
                else:
                    cK = 0
                print('Total K %d T %s%s' % (K, teit - tit, cc))

        if it >= conv_iter:
            if rank == 0:
                se = bn.nansum(e, axis=1)
                converged = (bn.nansum((se == conv_iter) + (se == 0)) == N)

                if (converged == np.bool_(True)) and (K > 0):
                    if verbose is True:
                        print("Converged after %d iterations." % it)
                    converged = True
                else:
                    converged = False

            converged = comm.bcast(converged, root=0)

        if converged is True:
            break

    if not converged and verbose and rank == 0:
        print("Failed to converge after %d iterations." % max_iter)

    if K > 0:
        I = np.nonzero(e[:, 0])[0]
        C = np.zeros((N,), dtype=np.int)
        tC = np.zeros((l,), dtype=np.int)

        for i in range(l):
            if disk is True:
                Ss.select_hyperslab((i, 0), (1, N))
                S.id.read(ms_l, Ss, tSl)
            else:
                tSl = tS[i]
            tC[i] = bn.nanargmax(tSl[I])

        comm.Gather([tC, MPI.INT], [C, MPI.INT])

        if rank == 0:
            C[I] = np.arange(K)
        comm.Bcast([C, MPI.INT])

        for k in range(K):
            ii = np.where(C == k)[0]
            tN = ii.shape[0]

            tI = np.zeros((tN,), dtype=np.float32)
            ttI = np.zeros((tN,), dtype=np.float32)
            tttI = np.zeros((tN,), dtype=np.float32)
            ms_k = h5s.create_simple((tN,))

            j = rank
            while j < tN:
                ind = [(ii[i], ii[j]) for i in range(tN)]
                SSs.select_elements(ind)
                SS.id.read(ms_k, SSs, tttI)
                ttI[j] = bn.nansum(tttI)
                j += NPROCS

            comm.Reduce([ttI, MPI.FLOAT], [tI, MPI.FLOAT])

            if rank == 0:
                I[k] = ii[bn.nanargmax(tI)]

        I.sort()
        comm.Bcast([I, MPI.INT])

        for i in range(l):
            if disk is True:
                Ss.select_hyperslab((i, 0), (1, N))
                S.id.read(ms_l, Ss, tSl)
            else:
                tSl = tS[i]
            tC[i] = bn.nanargmax(tSl[I])

        comm.Gather([tC, MPI.INT], [C, MPI.INT])
        if rank == 0:
            C[I] = np.arange(K)
    else:
        if rank == 0:
            I = np.zeros(())
            C = np.zeros(())

    # Cleanup
    SSf.close()
    TMf.close()

    if disk is True:
        TMLf.close()
        shutil.rmtree(TMLfd)

    comm.Barrier()

    if rank == 0:
        os.remove(P.TMfn)

        if verbose:
            print('APN: %d' % K)

        if I.size and C.size:
            Sf = h5py.File(Sfn, 'r+', driver='sec2')
            if 'aff_labels' in Sf.keys():
                del Sf['aff_labels']
            LM = Sf.require_dataset('aff_labels', shape=C.shape, dtype=np.int)
            LM[:] = C[:]
            if 'aff_centers' in Sf.keys():
                del Sf['aff_centers']
            CM = Sf.require_dataset('aff_centers', shape=I.shape, dtype=np.int)
            CM[:] = I[:]
            Sf.close()
r = N - l * NPROCS
if r != 0:
    N = N - r
    print('Truncating matrix to NxN to fit on %d procs' % NPROCS)

med = livestats.LiveStats()
madd = np.vectorize(med.add)

N = comm.bcast(N, root=0)
l = comm.bcast(l, root=0)

CMs = CM.id.get_space()
tCM = np.empty((N,), dtype=np.float)
ms = h5s.create_simple((N,))

tb, te = task(NPROCS - 1 - rank, l)
if rank == 0:
    te -= 1

# Remove degeneracies
for i in range(tb, te):
    CMs.select_hyperslab((i, 0), (1, N))
    CM.id.read(ms, CMs, tCM)
    if rank != 0:
        comm.Send([tCM, MPI.FLOAT], dest=0)
def load_pdb_coords(Sfn, pdb_list, topology=None, mpi=None, verbose=False,
                    *args, **kwargs):

    def check_pbc(coords, threshold=50):
        for i in range(len(coords) - 1):
            assert np.linalg.norm(coords[i] - coords[i + 1]) < threshold

    def parse_pdb(i):
        """Parse PDB files"""
        ps = prody.parsePDB(i)
        pc = ps.getCoords()
        check_pbc(pc)
        return pc

    @master
    def estimate_pdb_numatoms(topology):
        pdb_t = parse_pdb(topology)
        return pdb_t.shape

    @master
    def estimate_coord_shape(ftype='pdb', pdb_list=None, topology=None, NPROCS=1):
        N = len(pdb_list)
        r = N % NPROCS
        if r > 0:
            N = N - r
            print('Truncating number to %d to fit %s procs' % (N, NPROCS))

        if ftype == 'pdb':
            if not topology:
                topology = pdb_list[0]
            na, nc = estimate_pdb_numatoms(topology)

        shape = (N, na, nc)
        return shape

    @master
    def load_pdb_names(Sfn, pdb_list, topology=None):
        N = len(pdb_list)
        Sf = h5py.File(Sfn, 'r+', driver='sec2')
        Sf.atomic = True
        vls = h5py.special_dtype(vlen=str)
        L = Sf.create_dataset('labels', (N,), dtype=vls)
        L[:] = pdb_list[:]
        if not topology:
            topology = pdb_list[0]
        L.attrs['topology'] = topology
        Sf.close()

    comm, NPROCS, rank = mpi

    if len(pdb_list) == 1:
        ptrn = pdb_list[0]
        if '*' in ptrn or '?' in ptrn:
            pdb_list = glob.glob(ptrn)

    shape = estimate_coord_shape(pdb_list=pdb_list, topology=topology)
    shape = comm.bcast(shape)
    N = shape[0]
    chunk = (1,) + shape[1:]

    # Init storage for matrices
    # HDF5 file
    Sf = h5py.File(Sfn, 'w', driver='mpio', comm=comm)
    Sf.atomic = True

    # Table for RMSD
    S = Sf.create_dataset('struct', shape, dtype=np.float, chunks=chunk)

    # A little bit of dark magic for faster io
    Ss = S.id.get_space()
    tS = np.ndarray(chunk, dtype=np.float)
    ms = h5s.create_simple(chunk)

    tb, te = task(N, NPROCS, rank)
    for i in range(tb, te):
        try:
            tS = parse_pdb(pdb_list[i])
            if verbose:
                print('Parsed %s' % pdb_list[i])
        except Exception:
            raise ValueError('Broken structure %s' % pdb_list[i])

        Ss.select_hyperslab((i, 0, 0), chunk)
        S.id.write(ms, Ss, tS)

    # Wait for all processes
    comm.Barrier()
    Sf.close()

    load_pdb_names(Sfn, pdb_list[:N])
def make_new_dset(parent, shape=None, dtype=None, data=None, chunks=None,
                  compression=None, shuffle=None, fletcher32=None,
                  maxshape=None, compression_opts=None, fillvalue=None,
                  scaleoffset=None, track_times=None):
    """ Return a new low-level dataset identifier

    Only creates anonymous datasets.
    """

    # Convert data to a C-contiguous ndarray
    if data is not None:
        import base
        data = numpy.asarray(data, order="C", dtype=base.guess_dtype(data))

    # Validate shape
    if shape is None:
        if data is None:
            raise TypeError("Either data or shape must be specified")
        shape = data.shape
    else:
        shape = tuple(shape)
        if data is not None and (numpy.product(shape) != numpy.product(data.shape)):
            raise ValueError("Shape tuple is incompatible with data")

    tmp_shape = maxshape if maxshape is not None else shape
    # Validate chunk shape
    if isinstance(chunks, tuple) and (-numpy.array(
            [i >= j for i, j in zip(tmp_shape, chunks) if i is not None])).any():
        errmsg = "Chunk shape must not be greater than data shape in any dimension. " \
                 "{} is not compatible with {}".format(chunks, shape)
        raise ValueError(errmsg)

    if isinstance(dtype, h5py.Datatype):
        # Named types are used as-is
        tid = dtype.id
        dtype = tid.dtype  # Following code needs this
    else:
        # Validate dtype
        if dtype is None and data is None:
            dtype = numpy.dtype("=f4")
        elif dtype is None and data is not None:
            dtype = data.dtype
        else:
            dtype = numpy.dtype(dtype)
        tid = h5t.py_create(dtype, logical=1)

    # Legacy
    if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False:
        raise ValueError("Chunked format required for given storage options")

    # Legacy
    if compression is True:
        if compression_opts is None:
            compression_opts = 4
        compression = 'gzip'

    # Legacy
    if compression in _LEGACY_GZIP_COMPRESSION_VALS:
        if compression_opts is not None:
            raise TypeError("Conflict in compression options")
        compression_opts = compression
        compression = 'gzip'

    dcpl = filters.generate_dcpl(shape, dtype, chunks, compression,
                                 compression_opts, shuffle, fletcher32,
                                 maxshape, scaleoffset)

    if fillvalue is not None:
        fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    elif track_times is not None:
        raise TypeError("track_times must be either True or False")

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)
    sid = h5s.create_simple(shape, maxshape)

    dset_id = h5d.create(parent.id, None, tid, sid, dcpl=dcpl)

    if data is not None:
        dset_id.write(h5s.ALL, h5s.ALL, data)

    return dset_id
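# A hedged usage sketch for make_new_dset: create an anonymous chunked,
# resizable dataset and bind it to a name ("grid" and "new.h5" are illustrative).
import h5py

with h5py.File("new.h5", "w") as f:
    dsid = make_new_dset(f, shape=(10, 10), dtype="f4",
                         chunks=(5, 5), maxshape=(None, 10))
    f["grid"] = h5py.Dataset(dsid)  # link the anonymous dataset under a name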
def load_pdb_coords(Sfn, pdb_list, tier=1, topology=None, pbc=True,
                    threshold=10.0, mpi=None, verbose=False, selection='all',
                    *args, **kwargs):

    def check_pbc(coords, threshold=10.0, selection='all'):
        for i in range(len(coords) - 1):
            assert np.linalg.norm(coords[i] - coords[i + 1]) < threshold

    def parse_pdb(i, pbc=True, threshold=10.0, selection='all'):
        """Parse PDB files"""
        ps = prody.parsePDB(i)
        pc_ = ps.select(selection)
        if pc_ is None:
            raise ValueError('Empty selection "%s"' % selection)
        pc = pc_.getCoords()
        if pbc:
            check_pbc(pc, threshold)
        return pc

    def estimate_pdb_numatoms(topology, pbc=True, threshold=10.0, selection='all'):
        pdb_t = parse_pdb(topology, pbc=pbc, threshold=threshold,
                          selection=selection)
        return pdb_t.shape

    def estimate_coord_shape(ftype='pdb', pdb_list=None, topology=None,
                             pbc=True, threshold=10.0, selection='all',
                             NPROCS=1):
        N = len(pdb_list)
        r = N % NPROCS
        if r > 0:
            N = N - r
            print('Truncating number to %d to fit %s procs' % (N, NPROCS))

        if ftype == 'pdb':
            if not topology:
                topology = pdb_list[0]
            na, nc = estimate_pdb_numatoms(topology, pbc=pbc,
                                           threshold=threshold,
                                           selection=selection)

        shape = (N, na, nc)
        return shape

    def load_pdb_names(Sfn, pdb_list, topology=None, tier=1):
        N = len(pdb_list)
        Sf = h5py.File(Sfn, 'w', driver='sec2')
        vls = h5py.special_dtype(vlen=str)
        Gn = 'tier%d' % tier
        G = Sf.require_group(Gn)
        L = G.create_dataset('labels', (N,), dtype=vls)
        L[:] = pdb_list[:]
        if not topology:
            topology = pdb_list[0]
        L.attrs['topology'] = topology
        Sf.close()

    def load_from_previous_tier(Sfn, tier):
        Sf = h5py.File(Sfn, 'r+', driver='sec2')
        PGn = 'tier%d' % (tier - 1)
        PG = Sf.require_group(PGn)
        PS = PG['struct']
        nstruct, natoms, ncoords = PS.shape
        PNL = PG['labels']
        PC = PG['aff_centers'][:]
        nstruct = PC.shape[0]
        shape = (nstruct, natoms, ncoords)
        chunk = (1, natoms, ncoords)

        Gn = 'tier%d' % tier
        G = Sf.require_group(Gn)
        S = G.require_dataset('struct', shape, dtype=np.float, chunks=chunk)
        vls = h5py.special_dtype(vlen=str)
        L = G.require_dataset('labels', (nstruct,), dtype=vls)

        for i in range(nstruct):
            S[i] = PS[PC[i]][:]
            L[i] = PNL[PC[i]][:]

        Sf.close()

    comm, NPROCS, rank = mpi

    if tier > 1:
        if rank == 0:
            load_from_previous_tier(Sfn, tier)
        return

    if len(pdb_list) == 1:
        ptrn = pdb_list[0]
        if '*' in ptrn or '?' in ptrn:
            pdb_list = glob.glob(ptrn)
            pdb_list = natsorted(pdb_list)

    shape = None
    if rank == 0:
        shape = estimate_coord_shape(pdb_list=pdb_list, topology=topology,
                                     pbc=pbc, threshold=threshold,
                                     selection=selection, NPROCS=NPROCS)
        N = shape[0]
        load_pdb_names(Sfn, pdb_list[:N], topology=topology)

    shape = comm.bcast(shape)
    N = shape[0]
    chunk = (1,) + shape[1:]

    # Init storage for matrices
    # HDF5 file
    if NPROCS == 1:
        Sf = h5py.File(Sfn, 'r+', driver='sec2')
    else:
        Sf = h5py.File(Sfn, 'r+', driver='mpio', comm=comm)

    # Table for RMSD
    Gn = 'tier%d' % tier
    G = Sf.require_group(Gn)
    S = G.require_dataset('struct', shape, dtype=np.float, chunks=chunk)

    # A little bit of dark magic for faster io
    Ss = S.id.get_space()
    tS = np.ndarray(chunk, dtype=np.float)
    ms = h5s.create_simple(chunk)

    tb, te = task(N, NPROCS, rank)
    for i in range(tb, te):
        try:
            tS = parse_pdb(pdb_list[i], pbc=pbc, threshold=threshold,
                           selection=selection)
            if verbose:
                print('Parsed %s' % pdb_list[i])
        except Exception:
            raise ValueError('Broken structure %s' % pdb_list[i])

        Ss.select_hyperslab((i, 0, 0), chunk)
        S.id.write(ms, Ss, tS)

    # Wait for all processes
    comm.Barrier()
    Sf.close()
        P.l = l
        P.ll = tl
    else:
        P.l = l
        P.ll = l

    print('Cache size is %d of %d' % (P.ll, P.l))
    print("Estimated memory per process: %.2fG" % (ts * P.ll / 10.0 ** 9))

P = comm.bcast(P)

N = P.N
l = P.l
ll = P.ll

ms = h5s.create_simple((ll, N))
ms_l = h5s.create_simple((N,))
ms_e = h5s.create_simple((1,))

tb, te = task(rank, l)

tS = np.ndarray((ll, N), dtype=ft)
tSl = np.ndarray((N,), dtype=ft)
tdS = np.ndarray((1,), dtype=ft)

disk = P.disk
if disk is True:
    TMLfd = tempfile.mkdtemp()
    TMLfn = osp(TMLfd, P.TMfn + '_' + str(rank) + '.hdf5')
def __getitem__(self, args, new_dtype=None):
    """ Read a slice from the HDF5 dataset.

    Takes slices and recarray-style field names (more than one is
    allowed!) in any order.  Obeys basic NumPy rules, including
    broadcasting.
    """
    # This boilerplate code is based on h5py.Dataset.__getitem__
    args = args if isinstance(args, tuple) else (args,)
    if new_dtype is None:
        new_dtype = getattr(self._local, 'astype', None)

    # Sort field names from the rest of the args.
    names = tuple(x for x in args if isinstance(x, str))

    if names:
        # Read a subset of the fields in this structured dtype
        if len(names) == 1:
            names = names[0]  # Read with simpler dtype of this field
        args = tuple(x for x in args if not isinstance(x, str))
        return self.fields(names, _prior_dtype=new_dtype)[args]

    if new_dtype is None:
        new_dtype = self.dtype
    mtype = h5t.py_create(new_dtype)

    # === Special-case region references ====

    if len(args) == 1 and isinstance(args[0], h5r.RegionReference):
        obj = h5r.dereference(args[0], self.id)
        if obj != self.id:
            raise ValueError("Region reference must point to this dataset")

        sid = h5r.get_region(args[0], self.id)
        mshape = guess_shape(sid)
        if mshape is None:
            # 0D with no data (NULL or deselected SCALAR)
            return Empty(new_dtype)
        out = np.empty(mshape, dtype=new_dtype)
        if out.size == 0:
            return out

        sid_out = h5s.create_simple(mshape)
        sid_out.select_all()
        self.id.read(sid_out, sid, out, mtype)
        return out

    # === END CODE FROM h5py.Dataset.__getitem__ ===

    idx = ndindex(args).reduce(self.shape)

    arr = np.ndarray(idx.newshape(self.shape), new_dtype, order='C')

    for c, index in as_subchunks(idx, self.shape, self.chunks):
        if isinstance(self.id.data_dict[c], (slice, Slice, tuple, Tuple)):
            raw_idx = Tuple(self.id.data_dict[c],
                            *[slice(0, len(i)) for i in c.args[1:]]).raw
            a = self.id._read_chunk(raw_idx)
            self.id.data_dict[c] = a

        if self.id.data_dict[c].size != 0:
            arr_idx = c.as_subindex(idx)
            arr[arr_idx.raw] = self.id.data_dict[c][index.raw]

    return arr
def create_compact_dataset(loc, name, shape=None, dtype=None, data=None,
                           chunks=None, compression=None, shuffle=None,
                           fletcher32=None, maxshape=None,
                           compression_opts=None, fillvalue=None,
                           scaleoffset=None, track_times=None):
    """Create a new HDF5 dataset with a compact storage layout."""

    # Convert data to a C-contiguous ndarray
    if data is not None:
        import h5py._hl.base
        data = numpy.asarray(data, order="C",
                             dtype=h5py._hl.base.guess_dtype(data))

    # Validate shape
    if shape is None:
        if data is None:
            raise TypeError("Either data or shape must be specified")
        shape = data.shape
    else:
        shape = tuple(shape)
        if data is not None and (numpy.product(shape) != numpy.product(data.shape)):
            raise ValueError("Shape tuple is incompatible with data")

    if isinstance(dtype, h5py.Datatype):
        # Named types are used as-is
        tid = dtype.id
        dtype = tid.dtype  # Following code needs this
    else:
        # Validate dtype
        if dtype is None and data is None:
            dtype = numpy.dtype("=f4")
        elif dtype is None and data is not None:
            dtype = data.dtype
        else:
            dtype = numpy.dtype(dtype)
        tid = h5t.py_create(dtype, logical=1)

    # Legacy
    if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False:
        raise ValueError("Chunked format required for given storage options")

    # Legacy
    if compression is True:
        if compression_opts is None:
            compression_opts = 4
        compression = 'gzip'

    # Legacy
    if compression in range(10):
        if compression_opts is not None:
            raise TypeError("Conflict in compression options")
        compression_opts = compression
        compression = 'gzip'

    if h5py.version.version_tuple >= (2, 2, 0, ''):
        dcpl = filters.generate_dcpl(shape, dtype, chunks, compression,
                                     compression_opts, shuffle, fletcher32,
                                     maxshape, None)
    else:
        dcpl = filters.generate_dcpl(shape, dtype, chunks, compression,
                                     compression_opts, shuffle, fletcher32,
                                     maxshape)

    if fillvalue is not None:
        fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    elif track_times is not None:
        raise TypeError("track_times must be either True or False")

    dcpl.set_layout(h5d.COMPACT)

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)
    sid = h5s.create_simple(shape, maxshape)

    dset_id = h5d.create(loc.id, None, tid, sid, dcpl=dcpl)

    if data is not None:
        dset_id.write(h5s.ALL, h5s.ALL, data)

    dset = dataset.Dataset(dset_id)
    if name is not None:
        loc[name] = dset
    return dset
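# Hypothetical call to create_compact_dataset: small lookup tables suit the
# compact layout, which stores the data in the object header. Names are
# placeholders.
import numpy
import h5py

with h5py.File("compact.h5", "w") as f:
    lut = create_compact_dataset(f, "lut", data=numpy.arange(16, dtype="u1"))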
def make_new_dset(parent, shape=None, dtype=None, data=None, chunks=None,
                  compression=None, shuffle=None, fletcher32=None,
                  maxshape=None, compression_opts=None, fillvalue=None,
                  scaleoffset=None, track_times=None):
    """ Return a new low-level dataset identifier

    Only creates anonymous datasets.
    """

    # Convert data to a C-contiguous ndarray
    if data is not None:
        import base
        data = numpy.asarray(data, order="C", dtype=base.guess_dtype(data))

    # Validate shape
    if shape is None:
        if data is None:
            raise TypeError("Either data or shape must be specified")
        shape = data.shape
    else:
        shape = tuple(shape)
        if data is not None and (numpy.product(shape) != numpy.product(data.shape)):
            raise ValueError("Shape tuple is incompatible with data")

    # Validate dtype
    if dtype is None and data is None:
        dtype = numpy.dtype("=f4")
    elif dtype is None and data is not None:
        dtype = data.dtype
    else:
        dtype = numpy.dtype(dtype)

    # Legacy
    if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False:
        raise ValueError("Chunked format required for given storage options")

    # Legacy
    if compression is True:
        if compression_opts is None:
            compression_opts = 4
        compression = 'gzip'

    # Legacy
    if compression in range(10):
        if compression_opts is not None:
            raise TypeError("Conflict in compression options")
        compression_opts = compression
        compression = 'gzip'

    dcpl = filters.generate_dcpl(shape, dtype, chunks, compression,
                                 compression_opts, shuffle, fletcher32,
                                 maxshape, scaleoffset)

    if fillvalue is not None:
        fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    elif track_times is not None:
        raise TypeError("track_times must be either True or False")

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)
    sid = h5s.create_simple(shape, maxshape)

    tid = h5t.py_create(dtype, logical=1)

    dset_id = h5d.create(parent.id, None, tid, sid, dcpl=dcpl)

    if data is not None:
        dset_id.write(h5s.ALL, h5s.ALL, data)

    return dset_id
def __setitem__(self, args, val):
    """ Write to the HDF5 dataset from a Numpy array.

    NumPy's broadcasting rules are honored, for "simple" indexing
    (slices and integers).  For advanced indexing, the shapes must
    match.
    """
    args = args if isinstance(args, tuple) else (args,)

    # Sort field indices from the slicing
    names = tuple(x for x in args if isinstance(x, str))
    args = tuple(x for x in args if not isinstance(x, str))

    # Generally we try to avoid converting the arrays on the Python
    # side.  However, for compound literals this is unavoidable.
    vlen = h5t.check_dtype(vlen=self.dtype)
    if vlen not in (bytes, unicode, None):
        try:
            val = numpy.asarray(val, dtype=vlen)
        except ValueError:
            try:
                val = numpy.array([numpy.array(x, dtype=vlen) for x in val],
                                  dtype=self.dtype)
            except ValueError:
                pass
        if vlen == val.dtype:
            if val.ndim > 1:
                tmp = numpy.empty(shape=val.shape[:-1], dtype=object)
                tmp.ravel()[:] = [i for i in val.reshape(
                    (numpy.product(val.shape[:-1]), val.shape[-1]))]
            else:
                tmp = numpy.array([None], dtype=object)
                tmp[0] = val
            val = tmp

    elif self.dtype.kind == "O" or \
            (self.dtype.kind == 'V' and
             (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and
             (self.dtype.subdtype is None)):
        if len(names) == 1 and self.dtype.fields is not None:
            # Single field selected for write, from a non-array source
            if not names[0] in self.dtype.fields:
                raise ValueError("No such field for indexing: %s" % names[0])
            dtype = self.dtype.fields[names[0]][0]
            cast_compound = True
        else:
            dtype = self.dtype
            cast_compound = False

        val = numpy.asarray(val, dtype=dtype, order='C')
        if cast_compound:
            val = val.astype(numpy.dtype([(names[0], dtype)]))
    else:
        val = numpy.asarray(val, order='C')

    # Check for array dtype compatibility and convert
    if self.dtype.subdtype is not None:
        shp = self.dtype.subdtype[1]
        valshp = val.shape[-len(shp):]
        if valshp != shp:  # Last dimension has to match
            raise TypeError(
                "When writing to array types, last N dimensions have to match "
                "(got %s, but should be %s)" % (valshp, shp))
        mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
        mshape = val.shape[0:len(val.shape) - len(shp)]

    # Make a compound memory type if field-name slicing is required
    elif len(names) != 0:
        mshape = val.shape

        # Catch common errors
        if self.dtype.fields is None:
            raise TypeError("Illegal slicing argument (not a compound dataset)")
        mismatch = [x for x in names if x not in self.dtype.fields]
        if len(mismatch) != 0:
            mismatch = ", ".join('"%s"' % x for x in mismatch)
            raise ValueError(
                "Illegal slicing argument (fields %s not in dataset type)"
                % mismatch)

        # Write non-compound source into a single dataset field
        if len(names) == 1 and val.dtype.fields is None:
            subtype = h5t.py_create(val.dtype)
            mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
            mtype.insert(self._e(names[0]), 0, subtype)

        # Make a new source type keeping only the requested fields
        else:
            fieldnames = [x for x in val.dtype.names if x in names]  # Keep source order
            mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
            for fieldname in fieldnames:
                subtype = h5t.py_create(val.dtype.fields[fieldname][0])
                offset = val.dtype.fields[fieldname][1]
                mtype.insert(self._e(fieldname), offset, subtype)

    # Use mtype derived from array (let DatasetID.write figure it out)
    else:
        mshape = val.shape
        mtype = None

    # Perform the dataspace selection
    selection = sel.select(self.shape, args, dsid=self.id)

    if selection.nselect == 0:
        return

    # Broadcast scalars if necessary.
    if mshape == () and selection.mshape != ():
        if self.dtype.subdtype is not None:
            raise TypeError("Scalar broadcasting is not supported for array dtypes")
        val2 = numpy.empty(selection.mshape[-1], dtype=val.dtype)
        val2[...] = val
        val = val2
        mshape = val.shape

    # Perform the write, with broadcasting
    # Be careful to pad memory shape with ones to avoid HDF5 chunking
    # glitch, which kicks in for mismatched memory/file selections
    if len(mshape) < len(self.shape):
        mshape_pad = (1,) * (len(self.shape) - len(mshape)) + mshape
    else:
        mshape_pad = mshape
    mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,) * len(mshape_pad))
    for fspace in selection.broadcast(mshape):
        self.id.write(mspace, fspace, val, mtype)
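# A short sketch of the field-name write path handled above, assuming a
# compound dataset; the file, dataset and field names are illustrative.
import numpy
import h5py

dt = numpy.dtype([('x', 'f4'), ('y', 'i4')])
with h5py.File("compound.h5", "w") as f:
    dset = f.create_dataset("pts", shape=(4,), dtype=dt)
    dset['x'] = numpy.arange(4, dtype='f4')  # write a single field by name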
def __getitem__(self, args):
    """ Read a slice from the HDF5 dataset.

    Takes slices and recarray-style field names (more than one is
    allowed!) in any order.  Obeys basic NumPy rules, including
    broadcasting.  Also supports:

    * Boolean "mask" array indexing
    """
    args = args if isinstance(args, tuple) else (args,)

    # Sort field indices from the rest of the args.
    names = tuple(x for x in args if isinstance(x, str))
    args = tuple(x for x in args if not isinstance(x, str))

    def strip_fields(basetype):
        """ Strip extra dtype information from special types """
        if basetype.kind == 'O':
            return numpy.dtype('O')
        if basetype.fields is not None:
            if basetype.kind in ('i', 'u'):
                # Enum types are stored as integers carrying an 'enum' field
                return basetype.fields['enum'][0]
            fields = []
            for name in basetype.names:
                fff = basetype.fields[name]
                if len(fff) == 3:
                    (subtype, offset, meta) = fff
                else:
                    subtype, meta = fff
                    offset = 0
                subtype = strip_fields(subtype)
                fields.append((name, subtype))
            return numpy.dtype(fields)
        return basetype

    def readtime_dtype(basetype, names):
        """ Make a NumPy dtype appropriate for reading """
        basetype = strip_fields(basetype)

        if len(names) == 0:  # Not compound, or we want all fields
            return basetype

        if basetype.names is None:  # Names provided, but not compound
            raise ValueError("Field names only allowed for compound types")

        for name in names:  # Check all names are legal
            if name not in basetype.names:
                raise ValueError("Field %s does not appear in this type." % name)

        return numpy.dtype([(name, basetype.fields[name][0]) for name in names])

    # This is necessary because in the case of array types, NumPy
    # discards the array information at the top level.
    new_dtype = readtime_dtype(self.id.dtype, names)
    mtype = h5t.py_create(new_dtype)

    # === Scalar dataspaces =================

    if self.shape == ():
        fspace = self.id.get_space()
        selection = sel2.select_read(fspace, args)
        arr = numpy.ndarray(selection.mshape, dtype=new_dtype)
        for mspace, fspace in selection:
            self.id.read(mspace, fspace, arr, mtype)
        if selection.mshape is None:
            return arr[()]
        return arr

    # === Everything else ===================

    # Perform the dataspace selection.
    selection = sel.select(self.shape, args, dsid=self.id)

    if selection.nselect == 0:
        return numpy.ndarray((0,), dtype=new_dtype)

    # Up-converting to (1,) so that numpy.ndarray correctly creates
    # np.void rows in case of multi-field dtype. (issue 135)
    single_element = selection.mshape == ()
    mshape = (1,) if single_element else selection.mshape
    arr = numpy.ndarray(mshape, new_dtype, order='C')

    # HDF5 has a bug where if the memory shape has a different rank
    # than the dataset, the read is very slow
    if len(mshape) < len(self.shape):
        # pad with ones
        mshape = (1,)*(len(self.shape)-len(mshape)) + mshape

    # Perform the actual read
    mspace = h5s.create_simple(mshape)
    fspace = selection._id
    self.id.read(mspace, fspace, arr, mtype)

    # Patch up the output for NumPy
    if len(names) == 1:
        arr = arr[names[0]]  # Single-field recarray convention
    if arr.shape == ():
        arr = numpy.asscalar(arr)
    if single_element:
        arr = arr[0]
    return arr
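# Usage sketch: the read paths above, driven through the high-level API.
# The file and dataset names here are illustrative assumptions.
import numpy as np
import h5py

with h5py.File('example_read.h5', 'w') as f:
    dt = np.dtype([('coord', np.int32), ('pulse', np.float32)])
    data = np.zeros(10, dtype=dt)
    data['coord'] = np.arange(10)
    dset = f.create_dataset('events', data=data)

    coords = dset['coord']          # single-field read (recarray convention)
    both = dset['coord', 'pulse']   # multi-field read yields a compound dtype
    mask = data['coord'] % 2 == 0
    evens = dset[mask]              # boolean "mask" array indexing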
def __setitem__(self, args, val):
    """ Write to the HDF5 dataset from a Numpy array.

    NumPy's broadcasting rules are honored, for "simple" indexing
    (slices and integers).  For advanced indexing, the shapes must
    match.
    """
    args = args if isinstance(args, tuple) else (args,)

    # Sort field indices from the slicing
    names = tuple(x for x in args if isinstance(x, str))
    args = tuple(x for x in args if not isinstance(x, str))

    # Generally we try to avoid converting the arrays on the Python
    # side.  However, for compound literals this is unavoidable.
    if self.dtype.kind == "O" or \
       (self.dtype.kind == 'V' and \
       (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and \
       (self.dtype.subdtype is None)):
        if len(names) == 1 and self.dtype.fields is not None:
            # Single field selected for write, from a non-array source
            if not names[0] in self.dtype.fields:
                raise ValueError("No such field for indexing: %s" % names[0])
            dtype = self.dtype.fields[names[0]][0]
            cast_compound = True
        else:
            dtype = self.dtype
            cast_compound = False
        val = numpy.asarray(val, dtype=dtype, order='C')
        if cast_compound:
            val = val.astype(numpy.dtype([(names[0], dtype)]))
    else:
        val = numpy.asarray(val, order='C')

    # Check for array dtype compatibility and convert
    if self.dtype.subdtype is not None:
        shp = self.dtype.subdtype[1]
        valshp = val.shape[-len(shp):]
        if valshp != shp:  # Last dimension has to match
            raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
        mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
        mshape = val.shape[0:len(val.shape)-len(shp)]

    # Make a compound memory type if field-name slicing is required
    elif len(names) != 0:
        mshape = val.shape

        # Catch common errors
        if self.dtype.fields is None:
            raise TypeError("Illegal slicing argument (not a compound dataset)")
        mismatch = [x for x in names if x not in self.dtype.fields]
        if len(mismatch) != 0:
            mismatch = ", ".join('"%s"' % x for x in mismatch)
            raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch)

        # Write non-compound source into a single dataset field
        if len(names) == 1 and val.dtype.fields is None:
            subtype = h5t.py_create(val.dtype)
            mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
            mtype.insert(self._e(names[0]), 0, subtype)

        # Make a new source type keeping only the requested fields
        else:
            fieldnames = [x for x in val.dtype.names if x in names]  # Keep source order
            mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
            for fieldname in fieldnames:
                subtype = h5t.py_create(val.dtype.fields[fieldname][0])
                offset = val.dtype.fields[fieldname][1]
                mtype.insert(self._e(fieldname), offset, subtype)

    # Use mtype derived from array (let DatasetID.write figure it out)
    else:
        mshape = val.shape
        mtype = None

    # Perform the dataspace selection
    selection = sel.select(self.shape, args, dsid=self.id)

    if selection.nselect == 0:
        return

    # Broadcast scalars if necessary.
    if (mshape == () and selection.mshape != ()):
        if self.dtype.subdtype is not None:
            raise TypeError("Scalar broadcasting is not supported for array dtypes")
        val2 = numpy.empty(selection.mshape[-1], dtype=val.dtype)
        val2[...] = val
        val = val2
        mshape = val.shape

    # Perform the write, with broadcasting
    # Be careful to pad memory shape with ones to avoid HDF5 chunking
    # glitch, which kicks in for mismatched memory/file selections
    if len(mshape) < len(self.shape):
        mshape_pad = (1,)*(len(self.shape)-len(mshape)) + mshape
    else:
        mshape_pad = mshape
    mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,)*len(mshape_pad))
    for fspace in selection.broadcast(mshape):
        self.id.write(mspace, fspace, val, mtype)
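# Usage sketch: writing one field of a compound dataset, which exercises the
# "compound memory type" branch above. Names are illustrative assumptions.
import numpy as np
import h5py

with h5py.File('example_fieldwrite.h5', 'w') as f:
    dt = np.dtype([('coord', np.int32), ('pulse', np.float32)])
    dset = f.create_dataset('events', (5,), dtype=dt)
    # Non-compound source into a single field: a one-member compound memory
    # type is built so HDF5 updates only 'pulse' on disk, leaving 'coord' alone.
    dset['pulse'] = np.linspace(0.0, 1.0, 5).astype(np.float32)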
def __setitem__(self, args, val):
    """ Write to the HDF5 dataset from a Numpy array.

    NumPy's broadcasting rules are honored, for "simple" indexing
    (slices and integers).  For advanced indexing, the shapes must
    match.
    """
    args = args if isinstance(args, tuple) else (args,)

    # Sort field indices from the slicing
    names = tuple(x for x in args if isinstance(x, str))
    args = tuple(x for x in args if not isinstance(x, str))

    if len(names) != 0:
        raise TypeError("Field name selections are not allowed for write.")

    # Generally we try to avoid converting the arrays on the Python
    # side.  However, for compound literals this is unavoidable.
    if self.dtype.kind == "O" or \
       (self.dtype.kind == 'V' and \
       (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and \
       (self.dtype.subdtype is None)):
        val = numpy.asarray(val, dtype=self.dtype, order='C')
    else:
        val = numpy.asarray(val, order='C')

    # Check for array dtype compatibility and convert
    if self.dtype.subdtype is not None:
        shp = self.dtype.subdtype[1]
        valshp = val.shape[-len(shp):]
        if valshp != shp:  # Last dimension has to match
            raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
        mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
        mshape = val.shape[0:len(val.shape)-len(shp)]
    else:
        mshape = val.shape
        mtype = None

    # Perform the dataspace selection
    selection = sel.select(self.shape, args, dsid=self.id)

    if selection.nselect == 0:
        return

    # Broadcast scalars if necessary.
    if (mshape == () and selection.mshape != ()):
        if self.dtype.subdtype is not None:
            raise TypeError("Scalar broadcasting is not supported for array dtypes")
        val2 = numpy.empty(selection.mshape[-1], dtype=val.dtype)
        val2[...] = val
        val = val2
        mshape = val.shape

    # Perform the write, with broadcasting
    # Be careful to pad memory shape with ones to avoid HDF5 chunking
    # glitch, which kicks in for mismatched memory/file selections
    if len(mshape) < len(self.shape):
        mshape_pad = (1,)*(len(self.shape)-len(mshape)) + mshape
    else:
        mshape_pad = mshape
    mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,)*len(mshape_pad))
    for fspace in selection.broadcast(mshape):
        self.id.write(mspace, fspace, val, mtype)
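# Usage sketch: scalar and lower-rank broadcasting through the write path
# above. The file name is an illustrative assumption.
import numpy as np
import h5py

with h5py.File('example_broadcast.h5', 'w') as f:
    dset = f.create_dataset('grid', (4, 8), dtype=np.float32)
    dset[...] = 7.0             # scalar broadcast over the full selection
    dset[1, :] = np.arange(8)   # 1-D source written into a single row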
#RMf = h5py.File(fid)
RMf = h5py.File(RMfn, 'w', driver='mpio', comm=comm)
RMf.atomic = True

# Table for RMSD
RM = RMf.create_dataset(
    'rmsd',
    (N, N),
    dtype=np.float,
    chunks=(l, l))
RM.attrs['chunk'] = l
RMs = RM.id.get_space()

# Init calculations
tS = np.zeros((l, l), dtype=np.float)
ms = h5s.create_simple((l, l))

i, j = rank, rank
ic = S[i * l: (i + 1) * l]
jc = ic

for c in xrange(0, m):
    if rank == 0:
        tit = time.time()
    try:
        assert i == j
        calc_diag_chunk(ic, tS)
    except AssertionError:
        calc_chunk(ic, jc, tS)
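# Illustrative sketch (an assumption, not the truncated code's actual
# continuation): how the (l, l) memory dataspace `ms` would typically be
# paired with a file hyperslab to place a computed chunk into the table,
# mirroring the select_hyperslab/read pattern used elsewhere in this document.
def write_chunk(RM, RMs, ms, tS, i, j, l):
    # Select the (i, j) chunk of the file dataset...
    RMs.select_hyperslab((i * l, j * l), (l, l), op=h5s.SELECT_SET)
    # ...and write the in-memory block through the low-level identifier.
    RM.id.write(ms, RMs, tS)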