Example #1
    def __setitem__(self, args, val):
        """ Write to the HDF5 dataset from a Numpy array.

        NumPy's broadcasting rules are honored, for "simple" indexing
        (slices and integers).  For advanced indexing, the shapes must
        match.
        """
        args = args if isinstance(args, tuple) else (args,)

        # Sort field indices from the slicing
        names = tuple(x for x in args if isinstance(x, str))
        args = tuple(x for x in args if not isinstance(x, str))

        if len(names) != 0:
            raise TypeError("Field name selections are not allowed for write.")

        # Generally we try to avoid converting the arrays on the Python
        # side.  However, for compound literals this is unavoidable.
        if self.dtype.kind == 'V' and \
        (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V'):
            val = numpy.asarray(val, dtype=self.dtype, order='C')
        else:
            val = numpy.asarray(val, order='C')

        # Check for array dtype compatibility and convert
        if self.dtype.subdtype is not None:
            shp = self.dtype.subdtype[1]
            if val.shape[-len(shp):] != shp:
                raise TypeError("Can't broadcast to array dimension %s" % (shp,))
            mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
            mshape = val.shape[0:len(val.shape)-len(shp)]
        else:
            mshape = val.shape
            mtype = None

        # Perform the dataspace selection
        selection = sel.select(self.shape, args, dsid=self.id)

        if selection.nselect == 0:
            return

        # Broadcast scalars if necessary.
        if (mshape == () and selection.mshape != ()):
            if self.dtype.subdtype is not None:
                raise NotImplementedError("Scalar broadcasting is not supported for array dtypes")
            val2 = numpy.empty(selection.mshape[-1], dtype=val.dtype)
            val2[...] = val
            val = val2
            mshape = val.shape

        # Perform the write, with broadcasting
        # Be careful to pad memory shape with ones to avoid HDF5 chunking
        # glitch, which kicks in for mismatched memory/file selections
        if len(mshape) < len(self.shape):
            mshape_pad = (1,)*(len(self.shape)-len(mshape)) + mshape
        else:
            mshape_pad = mshape
        mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,)*len(mshape_pad))
        for fspace in selection.broadcast(mshape):
            self.id.write(mspace, fspace, val, mtype)
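
The padded memory shape plus UNLIMITED maxdims is the core trick here: one reusable mspace serves every file-side hyperslab. A minimal standalone sketch of that pattern (file and dataset names are hypothetical):

import numpy as np
import h5py
from h5py import h5s

with h5py.File("demo_write.h5", "w") as f:
    dset = f.create_dataset("x", shape=(4, 3), dtype="f4")
    val = np.arange(3, dtype="f4")                  # one row, broadcast to all rows
    mshape_pad = (1,) + val.shape                   # pad with ones, as above
    mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,) * len(mshape_pad))
    fspace = dset.id.get_space()
    for row in range(4):
        fspace.select_hyperslab((row, 0), (1, 3))   # one row in the file
        dset.id.write(mspace, fspace, val)          # same mspace every time
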
Example #2
def main():
    a = DetPulseCoord()
    fileid = h5f.create(b"test.h5")
    x = ones((100, 3), dtype=int32)
    y = ones((100, 7), dtype=float32)
    z = ones((100, 2), dtype=float32)
    c = [(x[i], y[i], z[i]) for i in range(100)]
    dspaceid = h5s.create_simple((1, ), (h5s.UNLIMITED, ))
    # dset = h5d.create(fileid, a.name, a.type, dspaceid)
    # dset.write()
    file = File("test.h5")
    numpytype = dtype([("coord", int32, (3, )), ("pulse", float32, (7, )),
                       ("EZ", float32, (2, ))])
    data = array(c, dtype=numpytype)
    tid = h5t.C_S1.copy()
    tid.set_size(6)
    H5T6 = Datatype(tid)
    tid.set_size(4)
    H5T_C_S1_4 = Datatype(tid)
    file.create_dataset("DetPulseCoord", data=data)
    file.attrs.create("CLASS", "TABLE", dtype=H5T6)
    file.attrs.create("FIELD_0_NAME", a.names[0])
    file.attrs.create("FIELD_1_NAME", a.names[1])
    file.attrs.create("TITLE", "Detpulse coord pair data")

    file.attrs.create("VERSION", "3.0", dtype=H5T_C_S1_4)
    file.attrs.create("abstime", 1.45e9, dtype=float64, shape=(1, ))
    file.attrs.create("nevents", 122421, dtype=float64, shape=(1, ))
    file.attrs.create("runtime", 125000, dtype=float64, shape=(1, ))
    file.flush()
Example #3
    def test_plugins(self):
        shape = (32 * 1024,)
        chunks = (4 * 1024,)
        dtype = np.int64
        data = np.arange(shape[0])
        fname = "tmp_test_filters.h5"
        f = h5py.File(fname)
        tid = h5t.py_create(dtype, logical=1)
        sid = h5s.create_simple(shape, shape)
        # Different APIs for different h5py versions.
        try:
            dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None,
                      None, None, None, None)
        except TypeError:
            dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None,
                      None, None, None)
        dcpl.set_filter(32008, h5z.FLAG_MANDATORY)
        dcpl.set_filter(32000, h5z.FLAG_MANDATORY)
        dset_id = h5d.create(f.id, "range", tid, sid, dcpl=dcpl)
        dset_id.write(h5s.ALL, h5s.ALL, data)
        f.close()

        # Make sure the filters are working outside of h5py by calling h5dump
        h5dump = Popen(['h5dump', fname],
                       stdout=PIPE, stderr=STDOUT)
        stdout, nothing = h5dump.communicate()
        #print stdout
        err = h5dump.returncode
        self.assertEqual(err, 0)

        f = h5py.File(fname, 'r')
        d = f['range'][:]
        self.assertTrue(np.all(d == data))
        f.close()
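
This test (and the variant below) assumes the third-party filter plugins with IDs 32000 and 32008 are installed. A quick availability check (a sketch, not part of the original test) avoids a confusing write failure:

from h5py import h5z

for code in (32000, 32008):
    if not h5z.filter_avail(code):
        raise RuntimeError("HDF5 filter plugin %d is not available" % code)
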
Example #4
    def test_plugins(self):
        if not H51811P:
            return
        shape = (32 * 1024, )
        chunks = (4 * 1024, )
        dtype = np.int64
        data = np.arange(shape[0])
        fname = "tmp_test_filters.h5"
        f = h5py.File(fname)
        tid = h5t.py_create(dtype, logical=1)
        sid = h5s.create_simple(shape, shape)
        # Different APIs for different h5py versions.
        try:
            dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None,
                                         None, None, None, None)
        except TypeError:
            dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None,
                                         None, None, None)
        dcpl.set_filter(32008, h5z.FLAG_MANDATORY)
        dcpl.set_filter(32000, h5z.FLAG_MANDATORY)
        dset_id = h5d.create(f.id, b"range", tid, sid, dcpl=dcpl)
        dset_id.write(h5s.ALL, h5s.ALL, data)
        f.close()

        # Make sure the filters are working outside of h5py by calling h5dump
        h5dump = Popen(['h5dump', fname], stdout=PIPE, stderr=STDOUT)
        stdout, nothing = h5dump.communicate()
        err = h5dump.returncode
        self.assertEqual(err, 0)

        f = h5py.File(fname, 'r')
        d = f['range'][:]
        self.assertTrue(np.all(d == data))
        f.close()
Example #5
def readDataSet(gid, name, dt=np.int32):
    did = h5d.open(gid, name)
    space_id = did.get_space()
    dims = space_id.get_simple_extent_dims()
    memsp_id = h5s.create_simple(dims)
    data = np.zeros(dims, dtype=dt)
    did.read(memsp_id, space_id, data)
    return data
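
A usage sketch for the helper above, assuming a file named data.h5 containing a dataset named vals (both names hypothetical):

import numpy as np
from h5py import h5f

fid = h5f.open(b"data.h5", h5f.ACC_RDONLY)
vals = readDataSet(fid, b"vals", dt=np.float64)
fid.close()
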
Example #6
    def __init__(self, shape, spaceid=None):
        """ Create a selection.  Shape may be None if spaceid is given. """
        if spaceid is not None:
            self._id = spaceid
            self._shape = spaceid.shape
        else:
            shape = tuple(shape)
            self._shape = shape
            self._id = h5s.create_simple(shape, (h5s.UNLIMITED, ) * len(shape))
            self._id.select_all()
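
Either construction path above could be exercised like this (SimpleSelection is a stand-in name; the class this __init__ belongs to is not shown):

from h5py import h5s

sel_a = SimpleSelection((10, 20))                 # builds its own dataspace
space = h5s.create_simple((10, 20))
sel_b = SimpleSelection(None, spaceid=space)      # wraps an existing one
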
Example #7
def set_preference(Sfn,
                   preference=None,
                   factor=1.0,
                   mpi=None,
                   verbose=False,
                   debug=False,
                   *args,
                   **kwargs):

    comm, NPROCS, rank = mpi

    #Init storage for matrices
    #Get file name
    #Open matrix file in parallel mode
    SSf = h5py.File(Sfn, 'r+', driver='sec2')
    SSf.atomic = True
    # Open table with data for clustering
    SS = SSf['cluster']
    SSs = SS.id.get_space()
    ms = h5s.create_simple((1, 1))
    tS = np.zeros((1, ), dtype=np.float32)

    ft = np.float32

    N, N1 = SS.shape

    if N != N1:
        raise ValueError("S must be a square array \
            (shape=%s)" % repr((N, N1)))

    if not preference:
        try:
            preference = SS.attrs['median']
        except KeyError:
            raise ValueError('Unable to get preference from cluster matrix')

    preference = ft(preference * factor)

    #Copy input data and
    #place preference on diagonal
    random_state = np.random.RandomState(0)
    x = np.finfo(ft).eps
    y = np.finfo(ft).tiny * 100

    for i in range(N):
        tS[0] = preference + (preference * x + y) * random_state.randn()
        SSs.select_hyperslab((i, i), (1, 1))
        SS.id.write(ms, SSs, tS)

    SS.attrs['preference'] = preference

    if verbose:
        print('Preference: %f' % preference)

    SSf.close()
Example #8
    def create(self, name, data, shape=None, dtype=None):
        """ Create a new attribute, overwriting any existing attribute.

        name
            Name of the new attribute (required)
        data
            An array to initialize the attribute (required)
        shape
            Shape of the attribute.  Overrides data.shape if both are
            given, in which case the total number of points must be unchanged.
        dtype
            Data type of the attribute.  Overrides data.dtype if both
            are given.
        """

        with phil:
            if data is not None:
                data = numpy.asarray(data, order="C", dtype=dtype)
                if shape is None:
                    shape = data.shape
                elif numpy.product(shape) != numpy.product(data.shape):
                    raise ValueError("Shape of new attribute conflicts with shape of data")

                if dtype is None:
                    dtype = data.dtype

            if isinstance(dtype, h5py.Datatype):
                htype = dtype.id
                dtype = htype.dtype
            else:
                if dtype is None:
                    dtype = numpy.dtype("f")
                htype = h5t.py_create(dtype, logical=True)

            if shape is None:
                raise ValueError('At least one of "shape" or "data" must be given')

            data = data.reshape(shape)

            space = h5s.create_simple(shape)

            if name in self:
                h5a.delete(self._id, self._e(name))

            attr = h5a.create(self._id, self._e(name), htype, space)

            if data is not None:
                try:
                    attr.write(data)
                except:
                    attr._close()
                    h5a.delete(self._id, self._e(name))
                    raise
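
The method above matches h5py's public AttributeManager.create signature, so it can be exercised as follows (file name and attribute values are hypothetical):

import numpy
import h5py

with h5py.File("attrs_demo.h5", "w") as f:
    f.attrs.create("scale", data=1.5, dtype=numpy.float32)
    f.attrs.create("grid", data=numpy.arange(6), shape=(2, 3))  # reshaped to fit
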
Example #9
    def create(self, name, data, shape=None, dtype=None):
        """ Create a new attribute, overwriting any existing attribute.

        name
            Name of the new attribute (required)
        data
            An array to initialize the attribute (required)
        shape
            Shape of the attribute.  Overrides data.shape if both are
            given, in which case the total number of points must be unchanged.
        dtype
            Data type of the attribute.  Overrides data.dtype if both
            are given.
        """

        if data is not None:
            data = numpy.asarray(data, order='C', dtype=dtype)
            if shape is None:
                shape = data.shape
            elif numpy.product(shape) != numpy.product(data.shape):
                raise ValueError(
                    "Shape of new attribute conflicts with shape of data")

            if dtype is None:
                dtype = data.dtype

        if isinstance(dtype, h5py.Datatype):
            htype = dtype.id
            dtype = htype.dtype
        else:
            if dtype is None:
                dtype = numpy.dtype('f')
            htype = h5t.py_create(dtype, logical=True)

        if shape is None:
            raise ValueError('At least one of "shape" or "data" must be given')

        data = data.reshape(shape)

        space = h5s.create_simple(shape)

        if name in self:
            h5a.delete(self._id, self._e(name))

        attr = h5a.create(self._id, self._e(name), htype, space)

        if data is not None:
            try:
                attr.write(data)
            except:
                attr._close()
                h5a.delete(self._id, self._e(name))
                raise
Example #10
    def create(self, name, data, shape=None, dtype=None):
        """ Create a new attribute, overwriting any existing attribute.

        name
            Name of the new attribute (required)
        data
            An array to initialize the attribute (required)
        shape
            Shape of the attribute.  Overrides data.shape if both are
            given, in which case the total number of points must be unchanged.
        dtype
            Data type of the attribute.  Overrides data.dtype if both
            are given.
        """
        # TODO: REMOVE WHEN UNICODE VLENS IMPLEMENTED
        # Hack to support Unicode values (scalars only)
        #if isinstance(data, unicode):
        #    unicode_hack = True
        #    data = data.encode('utf8')
        #else:
        #    unicode_hack = False

        if data is not None:
            data = numpy.asarray(data, order='C', dtype=dtype)
            if shape is None:
                shape = data.shape
            elif numpy.product(shape) != numpy.product(data.shape):
                raise ValueError("Shape of new attribute conflicts with shape of data")
                
            if dtype is None:
                dtype = data.dtype

        if dtype is None:
            dtype = numpy.dtype('f')
        if shape is None:
            raise ValueError('At least one of "shape" or "data" must be given')

        data = data.reshape(shape)

        space = h5s.create_simple(shape)
        htype = h5t.py_create(dtype, logical=True)

        # TODO: REMOVE WHEN UNICODE VLENS IMPLEMENTED
        #if unicode_hack:
        #    htype.set_cset(h5t.CSET_UTF8)

        if name in self:
            h5a.delete(self._id, self._e(name))

        attr = h5a.create(self._id, self._e(name), htype, space)
        if data is not None:
            attr.write(data)
Example #11
def _write_dset_low(dset, data, arr_slice, collective=False):
    memory_space = h5s.create_simple(data.shape)
    file_space = dset.id.get_space()

    s = (arr_slice[0].start,arr_slice[1].start,arr_slice[2].start)
    e = (arr_slice[0].stop,arr_slice[1].stop,arr_slice[2].stop)

    count = tuple([ee - ss for ss,ee in zip(s,e)])

    file_space.select_hyperslab(s, count)

    if collective:
        dxpl = h5p.create(h5p.DATASET_XFER)
        dxpl.set_dxpl_mpio(h5fd.MPIO_COLLECTIVE)
    else:
        dxpl = None

    dset.id.write(memory_space, file_space,
                  np.ascontiguousarray(data), dxpl=dxpl)
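
A sketch of driving the helper above from mpi4py with a parallel h5py build (assumes exactly two ranks and the dataset layout shown):

import numpy as np
import h5py
from mpi4py import MPI

comm = MPI.COMM_WORLD                       # sketch assumes comm.size == 2
with h5py.File("parallel.h5", "w", driver="mpio", comm=comm) as f:
    dset = f.create_dataset("d", (8, 8, 8), dtype="f8")
    block = np.full((4, 8, 8), comm.rank, dtype="f8")
    sl = (slice(comm.rank * 4, comm.rank * 4 + 4), slice(0, 8), slice(0, 8))
    _write_dset_low(dset, block, sl, collective=True)
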
Example #12
def calc_rmsd_matrix(Sfn,
                     tier=1,
                     mpi=None,
                     verbose=False,
                     noalign=False,
                     *args,
                     **kwargs):

    if noalign:
        cl = 'NOSUP_SERIAL_CALCULATOR'
    else:
        cl = "KABSCH_SERIAL_CALCULATOR"

    def calc_diag_chunk(ic, tS, cl):
        calculator = pyRMSD.RMSDCalculator.RMSDCalculator(cl, ic)
        rmsd = calculator.pairwiseRMSDMatrix()
        rmsd_matrix = condensedMatrix.CondensedMatrix(rmsd)
        ln = len(tS)
        for i in range(ln):
            for j in range(i):
                tS[i, j] = rmsd_matrix[i, j]

    def calc_chunk(ic, jc, tS, cl):
        ln, n, d = ic.shape
        ttS = np.zeros((ln + 1, n, d))
        ttS[1:] = jc
        for i in range(ln):
            ttS[0] = ic[i]
            calculator = pyRMSD.RMSDCalculator.RMSDCalculator(cl, ttS)
            tS[i] = calculator.oneVsFollowing(0)

    def partition(N, NPROCS, rank):
        # Partitioning
        l = N // NPROCS
        lr = N % NPROCS

        if lr > 0 and rank == 0:
            print('Truncating matrix to %dx%d to fit %d procs' %
                  (l * NPROCS, l * NPROCS, NPROCS))

        lN = (NPROCS + 1) * NPROCS // 2

        m = lN // NPROCS
        mr = lN % NPROCS

        if mr > 0:
            m = m + 1 if rank % 2 == 0 else m

        return (l, m)

    comm, NPROCS, rank = mpi

    # Reread structures by every process
    if NPROCS == 1:
        Sf = h5py.File(Sfn, 'r+', driver='sec2')
    else:
        Sf = h5py.File(Sfn, 'r+', driver='mpio', comm=comm)

    Gn = 'tier%d' % tier
    G = Sf.require_group(Gn)
    S = G['struct']
    # Count number of structures
    N = S.len()

    l, m = partition(N, NPROCS, rank)

    # HDF5 file
    # Table for RMSD
    RM = G.require_dataset('rmsd', (N, N), dtype=np.float32, chunks=(l, l))
    RM.attrs['chunk'] = l
    RMs = RM.id.get_space()

    # Init calculations
    tS = np.zeros((l, l), dtype=np.float32)
    ms = h5s.create_simple((l, l))

    i, j = rank, rank
    ic = S[i * l:(i + 1) * l]
    jc = ic

    for c in range(0, m):
        if rank == 0:
            tit = time.time()

        if i == j:
            calc_diag_chunk(ic, tS, cl)
        else:
            calc_chunk(ic, jc, tS, cl)

        RMs.select_hyperslab((i * l, j * l), (l, l))
        RM.id.write(ms, RMs, tS)

        if rank == 0:
            teit = time.time()
            if verbose:
                print("Step %d of %d T %s" % (c, m, teit - tit))

        # Dark magic of task assignment

        if 0 < (rank - c):
            j = j - 1
            jc = S[j * l:(j + 1) * l]
        elif rank - c == 0:
            i = NPROCS - rank - 1
            ic = S[i * l:(i + 1) * l]
        else:
            j = j + 1
            jc = S[j * l:(j + 1) * l]

    # Wait for all processes
    comm.Barrier()

    # Cleanup
    # Close matrix file
    Sf.close()
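
The partition() bookkeeping above is easier to see with concrete numbers: only the lower triangle of the NPROCS x NPROCS block grid, diagonal included, needs computing, which is (NPROCS + 1) * NPROCS / 2 blocks spread over the ranks. A standalone sanity check:

NPROCS = 4
lN = (NPROCS + 1) * NPROCS // 2   # 10 blocks to compute
m, mr = divmod(lN, NPROCS)        # 2 per rank, remainder 2
print(lN, m, mr)                  # -> 10 2 2
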
Example #13
def prepare_cluster_matrix(
        Sfn,
        tier=1,
        mpi=None,
        verbose=False,
        *args, **kwargs):

    def calc_chunk(l, tRM, tCM):
        ttCM = tRM + tCM * random_state.randn(l, l)
        return ttCM

    def calc_chunk_diag(l, tRM, tCM):
        ttCM = tCM + tCM.transpose()
        ttRM = tRM + tRM.transpose()
        ttCM = calc_chunk(l, ttRM, ttCM)
        return ttCM

    comm, NPROCS, rank = mpi

    #Init RMSD matrix
    #Open matrix file in parallel mode
    if NPROCS == 1:
        Sf = h5py.File(Sfn, 'r+', driver='sec2')
    else:
        Sf = h5py.File(Sfn, 'r+', driver='mpio', comm=comm)

    Gn = 'tier%d' % tier
    G = Sf.require_group(Gn)
    # Open table with data for clustering
    RM = G['rmsd']
    RMs = RM.id.get_space()

    N = RM.len()
    l = N // NPROCS

    if rank == 0:
        N, N1 = RM.shape

        if N != N1:
            raise ValueError(
                "S must be a square array (shape=%s)" % repr(RM.shape))

        if RM.attrs['chunk'] % l > 0:
            raise ValueError(
                "Wrong chunk size in RMSD matrix")

    CM = G.require_dataset(
        'cluster',
        (N, N),
        dtype=np.float32,
        chunks=(l, l))
    CM.attrs['chunk'] = l
    CMs = CM.id.get_space()

    random_state = np.random.RandomState(0)
    x = np.finfo(np.float32).eps
    y = np.finfo(np.float32).tiny * 100

    # Partitioning
    lN = (NPROCS + 1) * NPROCS // 2

    m = lN // NPROCS
    mr = lN % NPROCS

    if mr > 0:
        m = m + 1 if rank % 2 == 0 else m

    #Init calculations
    tRM = np.zeros((l, l), dtype=np.float32)
    tCM = np.zeros((l, l), dtype=np.float32)
    ttCM = np.zeros((l, l), dtype=np.float32)
    ms = h5s.create_simple((l, l))

    i, j = rank, rank

    for c in range(m):
        if rank == 0:
            tit = time.time()
        RMs.select_hyperslab((i * l, j * l), (l, l))
        RM.id.read(ms, RMs, tRM)

        #tRM = -1 * tRM ** 2
        tRM **= 2
        tRM *= -1
        tCM = tRM * x + y

        if i == j:
            ttCM = calc_chunk_diag(l, tRM[:], tCM[:])
            CMs.select_hyperslab((i * l, j * l), (l, l))
            CM.id.write(ms, CMs, ttCM)

        else:
            ttCM = calc_chunk(l, tRM[:], tCM[:])
            CMs.select_hyperslab((i * l, j * l), (l, l))
            CM.id.write(ms, CMs, ttCM)

            ttCM = calc_chunk(l, tRM.transpose(), tCM.transpose())
            CMs.select_hyperslab((j * l, i * l), (l, l))
            CM.id.write(ms, CMs, ttCM)

        if rank == 0:
            teit = time.time()
            if verbose:
                print "Step %d of %d T %s" % (c, m, teit - tit)

        if (rank - c) > 0:
            j = j - 1
        elif (rank - c) == 0:
            i = NPROCS - rank - 1
        else:
            j = j + 1

    #Wait for all processes
    comm.Barrier()

    Sf.close()
Example #14
        print('Dataset fits memory')
        comm.Abort()

P = comm.bcast(P)


N = P.N
l = P.l
ll = P.ll

tb, te = task(rank, l)
disk = P.disk

damping = P.damping

ms_l = h5s.create_simple((N,))
tSl = np.ndarray((N,), dtype=np.float)

ms = h5s.create_simple((ll, N))
tS = np.ndarray((ll, N), dtype=np.float)
tdS = np.ndarray((1,), dtype=np.float)


TMLf = h5py.File(P.TMfn + '_' + str(rank) + '.hdf5', 'w')

S = TMLf.create_dataset(
    'S', (l, N), dtype=np.float)
Ss = S.id.get_space()

#Copy input data and
#place preference on diagonal
Example #15
N, N1 = CM.shape

if N != N1:
    raise ValueError("S must be a square array (shape=%s)" % repr(CM.shape))

if l <= 0:
    raise ValueError("Wrong chunk size in RMSD matrix")

#Init calculations
#med = livestats.LiveStats()
med = lvc_double.Quantile(0.5)
madd = np.vectorize(med.add)

tCM = np.zeros((N, ), dtype=np.float)
ms = h5s.create_simple((N, ))

c = 0
for i in xrange(N):
    #for j in xrange(m):
    #print 'Processing chunk %d of %d' % (c, m2)
    #CMs.select_hyperslab((i * l, j * l), (l, l))
    CMs.select_hyperslab((i, 0), (1, N))
    CM.id.read(ms, CMs, tCM)
    med.add(tCM)
    #madd(tCM)
    #       for x in np.nditer(tCM):
    #           med.add(x)
    c += 1

#level, median = med.quantiles()[0]
Example #16
def _calc_ci_block(block_label, assignments_filename, kinetics_filename, istate, jstate, start_iter, stop_iter,
                   mcbs_alpha, mcbs_acalpha, mcbs_nsets, extrapolate):
    log.debug('istate={} jstate={} start_iter={} stop_iter={}'.format(istate,jstate,start_iter,stop_iter))
    assignments_file = h5py.File(assignments_filename, 'r')
    kinetics_file = h5py.File(kinetics_filename, 'r')
    
    nstates, nbins = assignments_file.attrs['nstates'], assignments_file.attrs['nbins']        
    niters = stop_iter - start_iter
    
    # Fluxes and populations are averaged as they are read, as these are generally
    # very large datasets
    avg_fluxes = numpy.zeros((nstates,nstates,nbins,nbins), weight_dtype)
    avg_pops = numpy.zeros((nstates,nbins), weight_dtype)
    
    # Per-iteration macrostate-macrostate fluxes, for correlation calculation
    macro_fluxes = numpy.empty((niters, nstates, nstates), weight_dtype)
    
    # Source datasets
    pops_ds = assignments_file['labeled_populations']
    fluxes_ds = kinetics_file['labeled_bin_fluxes']
    pops_iter_start = pops_ds.attrs.get('iter_start',1)
    fluxes_iter_start = fluxes_ds.attrs.get('iter_start',1)
    
    # prepend 1 so that rank of dest == rank of src
    labeled_fluxes = numpy.empty((1,nstates,nstates,nbins,nbins), weight_dtype)
    labeled_pops = numpy.empty((1,nstates,nbins), weight_dtype)
    
    lflux_memsel = h5s.create_simple(labeled_fluxes.shape, (h5s.UNLIMITED,)*labeled_fluxes.ndim)
    lpop_memsel  = h5s.create_simple(labeled_pops.shape, (h5s.UNLIMITED,)*labeled_pops.ndim)

    fluxes_dsid = fluxes_ds.id
    pops_dsid = pops_ds.id
    
    lflux_filesel = fluxes_dsid.get_space()
    lpop_filesel  = pops_dsid.get_space()
    
    
    # Overall average
    for iiter, n_iter in enumerate(xrange(start_iter, stop_iter)):
        lflux_filesel.select_hyperslab((n_iter-fluxes_iter_start,0,0,0,0), (1,nstates,nstates,nbins,nbins),
                                       op=h5s.SELECT_SET)
        lpop_filesel.select_hyperslab((n_iter-pops_iter_start,0,0), (1,nstates,nbins),
                                      op=h5s.SELECT_SET)                    
        fluxes_dsid.read(lflux_memsel, lflux_filesel, labeled_fluxes)
        pops_dsid.read(lpop_memsel, lpop_filesel, labeled_pops)
        avg_fluxes += labeled_fluxes[0]
        avg_pops += labeled_pops[0]        
        macro_fluxes[iiter] = labeled_fluxes[0].sum(axis=3).sum(axis=2)
        
    avg_fluxes /= niters
    avg_pops /= niters     
    avg_rates = labeled_flux_to_rate(avg_fluxes, avg_pops)
    ss, macro_rates = get_macrostate_rates(avg_rates, avg_pops, extrapolate)
    overall_avg_rates = macro_rates.copy()
    ctime = mcbs_correltime(macro_fluxes[istate, jstate], mcbs_acalpha, mcbs_nsets)


    # bootstrap
    lbi = int(math.floor(mcbs_nsets*mcbs_alpha/2.0))
    ubi = int(math.ceil(mcbs_nsets*(1-mcbs_alpha/2.0)))        
    stride = ctime + 1
    synth_rates = numpy.empty((mcbs_nsets,), weight_dtype)
    
    starts = numpy.arange(start_iter, stop_iter, stride, dtype=numpy.uintc)
    stops = numpy.arange(start_iter+stride, stop_iter+stride, stride, dtype=numpy.uintc)
    nblocks = len(starts)
    if stops[-1] > stop_iter: stops[-1] = stop_iter    
    
    for iset in xrange(mcbs_nsets):
        avg_fluxes.fill(0)
        avg_pops.fill(0)
        iters_averaged = 0
        log.debug('iset={} istate={} jstate={}'.format(iset,istate,jstate))
        
        for _block in xrange(nblocks):
            iblock = random.randint(0,nblocks-1)
            for n_iter in xrange(starts[iblock], stops[iblock]):
                iters_averaged += 1

                lflux_filesel.select_hyperslab((n_iter-fluxes_iter_start,0,0,0,0), (1,nstates,nstates,nbins,nbins),
                                               op=h5s.SELECT_SET)
                lpop_filesel.select_hyperslab((n_iter-pops_iter_start,0,0), (1,nstates,nbins),
                                              op=h5s.SELECT_SET)                    
                fluxes_dsid.read(lflux_memsel, lflux_filesel, labeled_fluxes)
                pops_dsid.read(lpop_memsel, lpop_filesel, labeled_pops)
                avg_fluxes += labeled_fluxes[0]
                avg_pops += labeled_pops[0]
        
        avg_fluxes /= iters_averaged
        avg_pops /= iters_averaged
        avg_rates = labeled_flux_to_rate(avg_fluxes, avg_pops)
        ss, macro_rates = get_macrostate_rates(avg_rates, avg_pops, extrapolate)
        synth_rates[iset] = macro_rates[istate, jstate]
    synth_rates.sort()
                
    return (block_label, istate, jstate,
            (start_iter, stop_iter, overall_avg_rates[istate, jstate], synth_rates[lbi], synth_rates[ubi], ctime))    
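
The "prepend 1" trick above keeps the memory buffer's rank equal to the file dataset's rank, so one reusable memory space serves every per-iteration slab. The same pattern in isolation (hypothetical dataset):

import numpy as np
import h5py
from h5py import h5s

with h5py.File("kin_demo.h5", "w") as f:
    ds = f.create_dataset("flux", (5, 2, 2), dtype="f8")
    buf = np.empty((1, 2, 2))                      # leading 1 matches src rank
    memsel = h5s.create_simple(buf.shape, (h5s.UNLIMITED,) * buf.ndim)
    filesel = ds.id.get_space()
    filesel.select_hyperslab((3, 0, 0), (1, 2, 2), op=h5s.SELECT_SET)
    ds.id.read(memsel, filesel, buf)               # one iteration's slab
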
Example #17
    t = parse(pdb_list[0])
    na = t.shape[0]
na = comm.bcast(na)

#Init storage for matrices
Sfn = 'aff_struct.hdf5'
#HDF5 file
Sf = h5py.File(Sfn, 'w', driver='mpio', comm=comm)
Sf.atomic = True
#Table for RMSD
S = Sf.create_dataset('struct', (N, na, nc),
                      dtype=np.float,
                      chunks=(1, na, nc))
Ss = S.id.get_space()
tS = np.ndarray((l, na, nc), dtype=np.float32)
ms = h5s.create_simple((l, na, nc))
for i in xrange(tb, te):
    try:
        print('Parsing %s' % pdb_list[i])
        tS[i - tb] = parse(pdb_list[i])
        print('Parsed %s' % pdb_list[i])
    except:
        raise ValueError('Broken structure %s' % pdb_list[i])

Ss.select_hyperslab((tb, 0, 0), (l, na, nc))
S.id.write(ms, Ss, tS)

#Wait for all processes
comm.Barrier()

Sf.close()
Example #18
def create_attribute(_id, _name, _dims, _value):
  """
  Writes an HDF5 string attribute, ASCII, NULLTERM
 
  _id should be something like dset.id
 
  _dims should be a list.  For a scalar, use an empty list []
 
  """
 
# Make sure we don't have a unicode name
  _name = str_to_h5(_name)

# This routine for string attributes
  _dtype = h5t.FORTRAN_S1
# Create a scalar space (if dims len=0); otherwise a simple space
  if len(_dims) == 0:
    _sid=h5s.create(h5s.SCALAR)
  elif len(_dims) == 1 and _dims[0] == 0 :
    _sid=h5s.create(h5s.SCALAR)
  else:
    _sid=h5s.create_simple(tuple(_dims))
# endif
 
# Create the memory & file datatypes. Adjust if datatype is string.
  _mdtype = _dtype.copy()
  _fdtype = _dtype.copy()
  _classtype = _dtype.get_class()
  if _classtype == h5t.STRING:
    if isinstance(_value, list):
      _strlen=0
      for _part in _value: _strlen=max(_strlen, len(_part))
    else:
      _strlen = len(_value)
#   endif
    if _strlen < 1: return None
    _mdtype.set_size(_strlen)
    _mdtype.set_strpad(h5t.STR_SPACEPAD)
    _fdtype.set_size(_strlen+1)
    _fdtype.set_strpad(h5t.STR_NULLTERM)
# endif
 
## Either add or replace the attribute
#  if h5a.exists(_id, _name):
#    _aid = h5a.open(_id, name=_name)
#  else:
#    _aid=h5a.create(_id, _name, _fdtype, _sid)
# endif
# Either add or replace the attribute
  if h5a.exists(_id, _name):
    h5a.delete(_id, name=_name)
# endif
  _aid = h5a.create(_id, _name, _fdtype, _sid)

  if _classtype == h5t.STRING:
    if isinstance(_value, list):
      _value = np.array(_value, dtype=np.string_)
    else:
      _value = np.array(str_to_h5(_value))
#   endif
  else:
    _pytype = _fdtype.dtype
    _value = np.array(_value, dtype=_pytype)
# endif
  _aid.write(_value)
  return _aid
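
Assuming the str_to_h5 helper this function relies on is importable from the same module, usage might look like this (file and attribute names are hypothetical):

import h5py

with h5py.File("attr_demo.h5", "w") as f:
    d = f.create_dataset("x", (3,), dtype="f4")
    create_attribute(d.id, "units", [], "seconds")            # scalar string
    create_attribute(d.id, "axes", [2], ["time", "value"])    # 1-D string list
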
Example #19
def aff_cluster(Sfn,
                conv_iter=15,
                max_iter=2000,
                damping=0.95,
                mpi=None,
                verbose=False,
                debug=False,
                *args,
                **kwargs):

    comm, NPROCS, rank = mpi

    NPROCS_LOCAL = int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])

    #Init storage for matrices
    #Get file name
    #Open matrix file in parallel mode
    SSf = h5py.File(Sfn, 'r+', driver='mpio', comm=comm)
    SSf.atomic = True
    # Open table with data for clustering
    SS = SSf['cluster']
    SSs = SS.id.get_space()

    params = {
        'N': 0,
        'l': 0,
        'll': 0,
        'TMfn': '',
        'disk': False,
        'preference': 0.0
    }

    P = Bunch(params)

    ft = np.float32

    if rank == 0:

        N, N1 = SS.shape

        if N != N1:
            raise ValueError("S must be a square array \
                (shape=%s)" % repr((N, N1)))
        else:
            P.N = N

        try:
            preference = SS.attrs['preference']
        except KeyError:
            raise ValueError('Unable to get preference from cluster matrix')

        if max_iter < 0:
            raise ValueError('max_iter must be > 0')

        if not 0 < conv_iter < max_iter:
            raise ValueError('conv_iter must lie in the interval between 0 and max_iter')

        if damping < 0.5 or damping >= 1:
            raise ValueError('damping must lie in interval between 0.5 and 1')

        print('#' * 10, 'Main params', '#' * 10)
        print('preference: %.3f' % preference)
        print('damping: %.3f' % damping)
        print('conv_iter: %d' % conv_iter)
        print('max_iter: %d' % max_iter)
        print('#' * 31)

        P.TMbfn = str(uuid.uuid1())
        P.TMfn = P.TMbfn + '.hdf5'

        # Magic 4 to fit MPI.Gather
        r = N % (NPROCS * 4)
        N -= r
        l = N // NPROCS
        if r > 0:
            print('Truncating matrix to %sx%s to fit on %d procs'
                  % (N, N, NPROCS))
        P.N = N

        # Fit to memory
        MEM = psutil.virtual_memory().available / NPROCS_LOCAL
        # MEM = 500 * 10 ** 6
        ts = np.dtype(ft).itemsize * N  # bytes per matrix row
        ts *= 8 * 1.1  # Allocate memory for e, tE, and ...
        # MEM -= ts  # ----
        tl = int(MEM // ts)  # Allocate memory for tS, tA, tR....

        def adjust_cache(tl, l):
            while float(l) % float(tl) > 0:
                tl -= 1
            return tl

        if tl < l:
            P.disk = True
            try:
                cache = 0
                #                cache = int(sys.argv[1])
                #                print sys.argv[1]
                assert cache < l
            except:
                cache = tl
                #print 'Wrong cache settings, set cache to %d' % tl
            tl = adjust_cache(tl, l)
            P.l = l
            P.ll = tl
        else:
            P.l = l
            P.ll = l

        if verbose:
            print "Available memory per process: %.2fG" % (MEM / 10.0**9)
            print "Memory per row: %.2fM" % (ts / 10.0**6)
            print "Estimated memory per process: %.2fG" \
                % (ts * P.ll / 10.0 ** 9)
            print 'Cache size is %d of %d' % (P.ll, P.l)

    P = comm.bcast(P)

    N = P.N
    l = P.l
    ll = P.ll

    ms = h5s.create_simple((ll, N))
    ms_l = h5s.create_simple((N, ))

    tb, te = task(N, NPROCS, rank)

    tS = np.ndarray((ll, N), dtype=ft)
    tSl = np.ndarray((N, ), dtype=ft)

    disk = P.disk

    if disk is True:
        TMLfd = tempfile.mkdtemp()
        TMLfn = osp(TMLfd, P.TMbfn + '_' + str(rank) + '.hdf5')
        TMLf = h5py.File(TMLfn, 'w')
        TMLf.atomic = True

        S = TMLf.create_dataset('S', (l, N), dtype=ft)
        Ss = S.id.get_space()

    #Copy input data and
    #place preference on diagonal
    z = -np.finfo(ft).max

    for i in range(tb, te, ll):
        SSs.select_hyperslab((i, 0), (ll, N))
        SS.id.read(ms, SSs, tS)

        if disk is True:
            Ss.select_hyperslab((i - tb, 0), (ll, N))
            S.id.write(ms, Ss, tS)

    if disk is True:
        R = TMLf.create_dataset('R', (l, N), dtype=ft)
        Rs = R.id.get_space()

    tRold = np.zeros((ll, N), dtype=ft)
    tR = np.zeros((ll, N), dtype=ft)
    tdR = np.zeros((l, ), dtype=ft)

    #Shared storage
    TMf = h5py.File(P.TMfn, 'w', driver='mpio', comm=comm)
    TMf.atomic = True

    Rp = TMf.create_dataset('Rp', (N, N), dtype=ft)
    Rps = Rp.id.get_space()

    tRp = np.ndarray((ll, N), dtype=ft)
    tRpa = np.ndarray((N, ll), dtype=ft)

    A = TMf.create_dataset('A', (N, N), dtype=ft)
    As = A.id.get_space()

    tAS = np.ndarray((ll, N), dtype=ft)
    tAold = np.ndarray((N, ll), dtype=ft)
    tA = np.ndarray((N, ll), dtype=ft)
    tdA = np.ndarray((l, ), dtype=ft)

    e = np.ndarray((N, conv_iter), dtype=np.int8)
    tE = np.ndarray((N, ), dtype=np.int8)
    ttE = np.ndarray((l, ), dtype=np.int8)

    converged = False
    cK = 0
    K = 0
    ind = np.arange(ll)

    for it in range(max_iter):
        if rank == 0:
            if verbose is True:
                print('=' * 10 + 'It %d' % it + '=' * 10)
                tit = time.time()
        # Compute responsibilities
        for i in range(tb, te, ll):
            if disk is True:
                il = i - tb
                Ss.select_hyperslab((il, 0), (ll, N))
                S.id.read(ms, Ss, tS)
                #tS = S[i, :]
                Rs.select_hyperslab((il, 0), (ll, N))
                R.id.read(ms, Rs, tRold)
            else:
                tRold = tR.copy()

            As.select_hyperslab((i, 0), (ll, N))
            A.id.read(ms, As, tAS)
            # tAS = A[i, :]
            tAS += tS
            #tRold = R[i, :]

            tI = bn.nanargmax(tAS, axis=1)
            tY = tAS[ind, tI]
            tAS[ind, tI[ind]] = z
            tY2 = bn.nanmax(tAS, axis=1)

            tR = tS - tY[:, np.newaxis]
            tR[ind, tI[ind]] = tS[ind, tI[ind]] - tY2[ind]
            tR = (1 - damping) * tR + damping * tRold

            tRp = np.maximum(tR, 0)

            for il in range(ll):
                tRp[il, i + il] = tR[il, i + il]
                tdR[i - tb + il] = tR[il, i + il]

            if disk is True:
                R.id.write(ms, Rs, tR)
                #R[i, :] = tR

            Rps.select_hyperslab((i, 0), (ll, N))
            Rp.id.write(ms, Rps, tRp)

            #Rp[i, :] = tRp
        if rank == 0:
            if verbose is True:
                teit1 = time.time()
                print('R T %s' % (teit1 - tit))

        comm.Barrier()

        # Compute availabilities
        for j in range(tb, te, ll):

            As.select_hyperslab((0, j), (N, ll))

            if disk is True:
                A.id.read(ms, As, tAold)
            else:
                tAold = tA.copy()

            Rps.select_hyperslab((0, j), (N, ll))
            Rp.id.read(ms, Rps, tRpa)
            #tRp = Rp[:, j]

            tA = bn.nansum(tRpa, axis=0)[np.newaxis, :] - tRpa
            for jl in range(ll):
                tdA[j - tb + jl] = tA[j + jl, jl]

            tA = np.minimum(tA, 0)

            for jl in range(ll):
                tA[j + jl, jl] = tdA[j - tb + jl]

            tA *= (1 - damping)
            tA += damping * tAold

            for jl in range(ll):
                tdA[j - tb + jl] = tA[j + jl, jl]

            A.id.write(ms, As, tA)

        if rank == 0:
            if verbose is True:
                teit2 = time.time()
                print('A T %s' % (teit2 - teit1))

        ttE = np.array(((tdA + tdR) > 0), dtype=np.int8)

        if NPROCS > 1:
            comm.Gather([ttE, MPI.INT], [tE, MPI.INT])
            comm.Bcast([tE, MPI.INT])
        else:
            tE = ttE
        e[:, it % conv_iter] = tE
        pK = K
        K = bn.nansum(tE)

        if rank == 0:
            if verbose is True:
                teit = time.time()
                cc = ''
                if K == pK:
                    if cK == 0:
                        cK += 1
                    elif cK > 1:
                        cc = ' Conv %d of %d' % (cK, conv_iter)
                else:
                    cK = 0

                print('Total K %d T %s%s' % (K, teit - tit, cc))

        if it >= conv_iter:

            if rank == 0:
                se = bn.nansum(e, axis=1)
                converged = (bn.nansum((se == conv_iter) + (se == 0)) == N)

                if (converged == np.bool_(True)) and (K > 0):
                    if verbose is True:
                        print("Converged after %d iterations." % (it))
                    converged = True
                else:
                    converged = False

            converged = comm.bcast(converged, root=0)

        if converged is True:
            break

    if not converged and verbose and rank == 0:
        print("Failed to converge after %d iterations." % (max_iter))

    if K > 0:

        I = np.nonzero(e[:, 0])[0]
        C = np.zeros((N, ), dtype=np.int)
        tC = np.zeros((l, ), dtype=np.int)

        for i in range(l):
            if disk is True:
                Ss.select_hyperslab((i, 0), (1, N))
                S.id.read(ms_l, Ss, tSl)
            else:
                tSl = tS[i]

            tC[i] = bn.nanargmax(tSl[I])

        comm.Gather([tC, MPI.INT], [C, MPI.INT])

        if rank == 0:
            C[I] = np.arange(K)

        comm.Bcast([C, MPI.INT])

        for k in range(K):
            ii = np.where(C == k)[0]
            tN = ii.shape[0]

            tI = np.zeros((tN, ), dtype=np.float32)
            ttI = np.zeros((tN, ), dtype=np.float32)
            tttI = np.zeros((tN, ), dtype=np.float32)
            ms_k = h5s.create_simple((tN, ))

            j = rank
            while j < tN:
                ind = [(ii[i], ii[j]) for i in range(tN)]
                SSs.select_elements(ind)
                SS.id.read(ms_k, SSs, tttI)

                ttI[j] = bn.nansum(tttI)
                j += NPROCS

            comm.Reduce([ttI, MPI.FLOAT], [tI, MPI.FLOAT])

            if rank == 0:
                I[k] = ii[bn.nanargmax(tI)]

        I.sort()
        comm.Bcast([I, MPI.INT])

        for i in range(l):
            if disk is True:
                Ss.select_hyperslab((i, 0), (1, N))
                S.id.read(ms_l, Ss, tSl)
            else:
                tSl = tS[i]

            tC[i] = bn.nanargmax(tSl[I])

        comm.Gather([tC, MPI.INT], [C, MPI.INT])

        if rank == 0:
            C[I] = np.arange(K)

    else:
        if rank == 0:
            I = np.zeros(())
            C = np.zeros(())

    #Cleanup
    SSf.close()
    TMf.close()

    if disk is True:
        TMLf.close()
        shutil.rmtree(TMLfd)

    comm.Barrier()

    if rank == 0:

        os.remove(P.TMfn)

        if verbose:
            print('APN: %d' % K)

        if I.size and C.size:

            Sf = h5py.File(Sfn, 'r+', driver='sec2')

            if 'aff_labels' in Sf.keys():
                del Sf['aff_labels']

            LM = Sf.require_dataset('aff_labels', shape=C.shape, dtype=np.int)
            LM[:] = C[:]

            if 'aff_centers' in Sf.keys():
                del Sf['aff_centers']

            CM = Sf.require_dataset('aff_centers', shape=I.shape, dtype=np.int)
            CM[:] = I[:]
            Sf.close()
Example #20
    r = N - l * NPROCS
    if r != 0:
        N = N - r
        print('Truncating matrix to %dx%d to fit on %d procs' % (N, N, NPROCS))

    med = livestats.LiveStats()
    madd = np.vectorize(med.add)

N = comm.bcast(N, root=0)
l = comm.bcast(l, root=0)

CMs = CM.id.get_space()
tCM = np.empty((N, ), dtype=np.float32)

ms = h5s.create_simple((N, ))

tb, te = task(NPROCS - 1 - rank, l)

if rank == 0:
    te -= 1

# Remove degeneracies
for i in xrange(tb, te):

    CMs.select_hyperslab((i, 0), (1, N))
    CM.id.read(ms, CMs, tCM)

    if rank != 0:
        comm.Send([tCM, MPI.FLOAT], dest=0)
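
The Send above implies a matching receive loop on rank 0, which this fragment does not show. A plausible counterpart (an assumption; rows_expected is a hypothetical count of incoming rows):

if rank == 0:
    buf = np.empty((N, ), dtype=np.float32)
    for _ in range(rows_expected):
        status = MPI.Status()
        comm.Recv([buf, MPI.FLOAT], source=MPI.ANY_SOURCE, status=status)
        med.add(buf)     # fold each received row into the running median
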
Example #21
def load_pdb_coords(Sfn,
                    pdb_list,
                    topology=None,
                    mpi=None,
                    verbose=False,
                    *args,
                    **kwargs):
    def check_pbc(coords, threshold=50):
        for i in range(len(coords) - 1):
            assert np.linalg.norm(coords[i] - coords[i + 1]) < threshold

    def parse_pdb(i):
        """Parse PDB files"""
        ps = prody.parsePDB(i)
        pc = ps.getCoords()
        check_pbc(pc)
        return pc

    @master
    def estimate_pdb_numatoms(topology):

        pdb_t = parse_pdb(topology)

        return pdb_t.shape

    @master
    def estimate_coord_shape(ftype='pdb',
                             pdb_list=None,
                             topology=None,
                             NPROCS=1):

        N = len(pdb_list)
        r = N % NPROCS

        if r > 0:
            N = N - r
            print('Truncating number to %d to fit %s procs' % (N, NPROCS))

        if ftype == 'pdb':
            if not topology:
                topology = pdb_list[0]
            na, nc = estimate_pdb_numatoms(topology)

        shape = (N, na, nc)

        return shape

    @master
    def load_pdb_names(Sfn, pdb_list, topology=None):
        N = len(pdb_list)

        Sf = h5py.File(Sfn, 'r+', driver='sec2')
        Sf.atomic = True

        vls = h5py.special_dtype(vlen=str)
        L = Sf.create_dataset('labels', (N, ), dtype=vls)

        L[:] = pdb_list[:]

        if not topology:
            topology = pdb_list[0]

        L.attrs['topology'] = topology

        Sf.close()

    comm, NPROCS, rank = mpi

    if len(pdb_list) == 1:
        ptrn = pdb_list[0]
        if '*' in ptrn or '?' in ptrn:
            pdb_list = glob.glob(ptrn)

    shape = estimate_coord_shape(pdb_list=pdb_list, topology=topology)
    shape = comm.bcast(shape)
    N = shape[0]
    chunk = (1, ) + shape[1:]

    #Init storage for matrices
    #HDF5 file
    Sf = h5py.File(Sfn, 'w', driver='mpio', comm=comm)
    Sf.atomic = True
    #Table for RMSD
    S = Sf.create_dataset('struct', shape, dtype=np.float, chunks=chunk)

    # A little bit of dark magic for faster io
    Ss = S.id.get_space()
    tS = np.ndarray(chunk, dtype=np.float)
    ms = h5s.create_simple(chunk)

    tb, te = task(N, NPROCS, rank)

    for i in range(tb, te):
        try:
            tS = parse_pdb(pdb_list[i])
            if verbose:
                print('Parsed %s' % pdb_list[i])
        except:
            raise ValueError('Broken structure %s' % pdb_list[i])

        Ss.select_hyperslab((i, 0, 0), chunk)
        S.id.write(ms, Ss, tS)

    #Wait for all processes
    comm.Barrier()

    Sf.close()

    load_pdb_names(Sfn, pdb_list[:N])
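
The @master decorator is imported from elsewhere. Since non-root ranks fall through to comm.bcast with shape unset, it presumably runs the wrapped function on rank 0 only, something like this (an assumption, not the original definition):

def master(func):
    """Run func on MPI rank 0 only; other ranks get None."""
    def wrapper(*args, **kwargs):
        if rank == 0:          # rank is assumed visible at module scope
            return func(*args, **kwargs)
        return None
    return wrapper
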
Example #22
def make_new_dset(parent, shape=None, dtype=None, data=None,
                  chunks=None, compression=None, shuffle=None,
                  fletcher32=None, maxshape=None, compression_opts=None,
                  fillvalue=None, scaleoffset=None, track_times=None):
    """ Return a new low-level dataset identifier

    Only creates anonymous datasets.
    """

    # Convert data to a C-contiguous ndarray
    if data is not None:
        import base
        data = numpy.asarray(data, order="C", dtype=base.guess_dtype(data))

    # Validate shape
    if shape is None:
        if data is None:
            raise TypeError("Either data or shape must be specified")
        shape = data.shape
    else:
        shape = tuple(shape)
        if data is not None and (numpy.product(shape) != numpy.product(data.shape)):
            raise ValueError("Shape tuple is incompatible with data")

    tmp_shape = maxshape if maxshape is not None else shape
    # Validate chunk shape
    if isinstance(chunks, tuple) and (-numpy.array([ i>=j for i,j in zip(tmp_shape,chunks) if i is not None])).any():
        errmsg = "Chunk shape must not be greater than data shape in any dimension. "\
                 "{} is not compatible with {}".format(chunks, shape)
        raise ValueError(errmsg)

    if isinstance(dtype, h5py.Datatype):
        # Named types are used as-is
        tid = dtype.id
        dtype = tid.dtype  # Following code needs this
    else:
        # Validate dtype
        if dtype is None and data is None:
            dtype = numpy.dtype("=f4")
        elif dtype is None and data is not None:
            dtype = data.dtype
        else:
            dtype = numpy.dtype(dtype)
        tid = h5t.py_create(dtype, logical=1)

    # Legacy
    if any((compression, shuffle, fletcher32, maxshape,scaleoffset)) and chunks is False:
        raise ValueError("Chunked format required for given storage options")

    # Legacy
    if compression is True:
        if compression_opts is None:
            compression_opts = 4
        compression = 'gzip'

    # Legacy
    if compression in _LEGACY_GZIP_COMPRESSION_VALS:
        if compression_opts is not None:
            raise TypeError("Conflict in compression options")
        compression_opts = compression
        compression = 'gzip'

    dcpl = filters.generate_dcpl(shape, dtype, chunks, compression, compression_opts,
                  shuffle, fletcher32, maxshape, scaleoffset)

    if fillvalue is not None:
        fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    elif track_times is not None:
        raise TypeError("track_times must be either True or False")

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)
    sid = h5s.create_simple(shape, maxshape)


    dset_id = h5d.create(parent.id, None, tid, sid, dcpl=dcpl)

    if data is not None:
        dset_id.write(h5s.ALL, h5s.ALL, data)

    return dset_id
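
Because only anonymous datasets are created, the caller must link the result into the file itself. A usage sketch (file and link names are hypothetical):

import h5py
from h5py import h5o

f = h5py.File("anon_demo.h5", "w")
dsid = make_new_dset(f, shape=(100,), dtype="=f4",
                     chunks=(10,), maxshape=(None,))
h5o.link(dsid, f.id, b"grown")    # give the anonymous dataset a name
f.close()
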
Example #23
def make_new_dset(parent,
                  shape=None,
                  dtype=None,
                  data=None,
                  chunks=None,
                  compression=None,
                  shuffle=None,
                  fletcher32=None,
                  maxshape=None,
                  compression_opts=None,
                  fillvalue=None,
                  scaleoffset=None,
                  track_times=None):
    """ Return a new low-level dataset identifier

    Only creates anonymous datasets.
    """

    # Convert data to a C-contiguous ndarray
    if data is not None:
        import base
        data = numpy.asarray(data, order="C", dtype=base.guess_dtype(data))

    # Validate shape
    if shape is None:
        if data is None:
            raise TypeError("Either data or shape must be specified")
        shape = data.shape
    else:
        shape = tuple(shape)
        if data is not None and (numpy.product(shape) != numpy.product(
                data.shape)):
            raise ValueError("Shape tuple is incompatible with data")

    tmp_shape = maxshape if maxshape is not None else shape
    # Validate chunk shape
    if isinstance(chunks, tuple) and (-numpy.array(
        [i >= j for i, j in zip(tmp_shape, chunks) if i is not None])).any():
        errmsg = "Chunk shape must not be greater than data shape in any dimension. "\
                 "{} is not compatible with {}".format(chunks, shape)
        raise ValueError(errmsg)

    if isinstance(dtype, h5py.Datatype):
        # Named types are used as-is
        tid = dtype.id
        dtype = tid.dtype  # Following code needs this
    else:
        # Validate dtype
        if dtype is None and data is None:
            dtype = numpy.dtype("=f4")
        elif dtype is None and data is not None:
            dtype = data.dtype
        else:
            dtype = numpy.dtype(dtype)
        tid = h5t.py_create(dtype, logical=1)

    # Legacy
    if any((compression, shuffle, fletcher32, maxshape,
            scaleoffset)) and chunks is False:
        raise ValueError("Chunked format required for given storage options")

    # Legacy
    if compression is True:
        if compression_opts is None:
            compression_opts = 4
        compression = 'gzip'

    # Legacy
    if compression in _LEGACY_GZIP_COMPRESSION_VALS:
        if compression_opts is not None:
            raise TypeError("Conflict in compression options")
        compression_opts = compression
        compression = 'gzip'

    dcpl = filters.generate_dcpl(shape, dtype, chunks, compression,
                                 compression_opts, shuffle, fletcher32,
                                 maxshape, scaleoffset)

    if fillvalue is not None:
        fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    elif track_times is not None:
        raise TypeError("track_times must be either True or False")

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED
                         for m in maxshape)
    sid = h5s.create_simple(shape, maxshape)

    dset_id = h5d.create(parent.id, None, tid, sid, dcpl=dcpl)

    if data is not None:
        dset_id.write(h5s.ALL, h5s.ALL, data)

    return dset_id
Example #24
def load_pdb_coords(Sfn,
                    pdb_list,
                    tier=1,
                    topology=None,
                    pbc=True,
                    threshold=10.0,
                    mpi=None,
                    verbose=False,
                    selection='all',
                    *args,
                    **kwargs):
    def check_pbc(coords, threshold=10.0, selection='all'):
        for i in range(len(coords) - 1):
            assert np.linalg.norm(coords[i] - coords[i + 1]) < threshold

    def parse_pdb(i, pbc=True, threshold=10.0, selection='all'):
        """Parse PDB files"""
        ps = prody.parsePDB(i)
        pc_ = ps.select(selection)
        if pc_ is None:
            raise ValueError('Empty selection "%s"' % selection)

        pc = pc_.getCoords()
        if pbc:
            check_pbc(pc, threshold)
        return pc

    def estimate_pdb_numatoms(topology,
                              pbc=True,
                              threshold=10.0,
                              selection='all'):

        pdb_t = parse_pdb(topology,
                          pbc=pbc,
                          threshold=threshold,
                          selection=selection)

        return pdb_t.shape

    def estimate_coord_shape(
        ftype='pdb',
        pdb_list=None,
        topology=None,
        pbc=True,
        threshold=10.0,
        selection='all',
        NPROCS=1,
    ):

        N = len(pdb_list)
        r = N % NPROCS

        if r > 0:
            N = N - r
            print('Truncating number to %d to fit %s procs' % (N, NPROCS))

        if ftype == 'pdb':
            if not topology:
                topology = pdb_list[0]
            na, nc = estimate_pdb_numatoms(topology,
                                           pbc=pbc,
                                           threshold=threshold,
                                           selection=selection)

        shape = (N, na, nc)

        return shape

    def load_pdb_names(Sfn, pdb_list, topology=None, tier=1):
        N = len(pdb_list)

        Sf = h5py.File(Sfn, 'w', driver='sec2')

        vls = h5py.special_dtype(vlen=str)
        Gn = 'tier%d' % tier
        G = Sf.require_group(Gn)
        L = G.create_dataset('labels', (N, ), dtype=vls)

        L[:] = pdb_list[:]

        if not topology:
            topology = pdb_list[0]

        L.attrs['topology'] = topology

        Sf.close()

    def load_from_previous_tier(Sfn, tier):
        Sf = h5py.File(Sfn, 'r+', driver='sec2')

        PGn = 'tier%d' % (tier - 1)
        PG = Sf.require_group(PGn)

        PS = PG['struct']
        nstruct, natoms, ncoords = PS.shape
        PNL = PG['labels']

        PC = PG['aff_centers'][:]
        nstruct = PC.shape[0]

        shape = (nstruct, natoms, ncoords)
        chunk = (1, natoms, ncoords)

        Gn = 'tier%d' % tier
        G = Sf.require_group(Gn)
        S = G.require_dataset('struct', shape, dtype=np.float, chunks=chunk)

        vls = h5py.special_dtype(vlen=str)
        L = G.require_dataset('labels', (nstruct, ), dtype=vls)

        for i in range(nstruct):
            S[i] = PS[PC[i]][:]
            L[i] = PNL[PC[i]][:]

        Sf.close()

    comm, NPROCS, rank = mpi

    if tier > 1:
        if rank == 0:
            load_from_previous_tier(Sfn, tier)
            return
        else:
            return

    if len(pdb_list) == 1:
        ptrn = pdb_list[0]
        if '*' in ptrn or '?' in ptrn:
            pdb_list = glob.glob(ptrn)
            pdb_list = natsorted(pdb_list)

    shape = None

    if rank == 0:
        shape = estimate_coord_shape(pdb_list=pdb_list,
                                     topology=topology,
                                     pbc=pbc,
                                     threshold=threshold,
                                     selection=selection,
                                     NPROCS=NPROCS)

        N = shape[0]
        load_pdb_names(Sfn, pdb_list[:N], topology=topology)

    shape = comm.bcast(shape)
    N = shape[0]
    chunk = (1, ) + shape[1:]

    # Init storage for matrices
    # HDF5 file
    if NPROCS == 1:
        Sf = h5py.File(Sfn, 'r+', driver='sec2')
    else:
        Sf = h5py.File(Sfn, 'r+', driver='mpio', comm=comm)

    # Table for RMSD
    Gn = 'tier%d' % tier
    G = Sf.require_group(Gn)
    S = G.require_dataset('struct', shape, dtype=np.float, chunks=chunk)

    # A little bit of dark magic for faster io
    Ss = S.id.get_space()
    tS = np.ndarray(chunk, dtype=np.float)
    ms = h5s.create_simple(chunk)

    tb, te = task(N, NPROCS, rank)

    for i in range(tb, te):
        try:
            tS = parse_pdb(pdb_list[i],
                           pbc=pbc,
                           threshold=threshold,
                           selection=selection)

            if verbose:
                print('Parsed %s' % pdb_list[i])
        except:
            raise ValueError('Broken structure %s' % pdb_list[i])

        Ss.select_hyperslab((i, 0, 0), chunk)
        S.id.write(ms, Ss, tS)

    # Wait for all processes
    comm.Barrier()

    Sf.close()
Example #25
        P.l = l
        P.ll = tl
    else:
        P.l = l
        P.ll = l

    print('Cache size is %d of %d' % (P.ll, P.l))
    print("Estimated memory per process: %.2fG" % (ts * P.ll / 10.0 ** 9))

P = comm.bcast(P)

N = P.N
l = P.l
ll = P.ll

ms = h5s.create_simple((ll, N))
ms_l = h5s.create_simple((N,))
ms_e = h5s.create_simple((1,))


tb, te = task(rank, l)

tS = np.ndarray((ll, N), dtype=ft)
tSl = np.ndarray((N,), dtype=ft)
tdS = np.ndarray((1,), dtype=ft)

disk = P.disk

if disk is True:
    TMLfd = tempfile.mkdtemp()
    TMLfn = osp(TMLfd, P.TMfn + '_' + str(rank) + '.hdf5')
Example #26
    def __getitem__(self, args, new_dtype=None):
        """ Read a slice from the HDF5 dataset.

        Takes slices and recarray-style field names (more than one is
        allowed!) in any order.  Obeys basic NumPy rules, including
        broadcasting.

        """
        # This boilerplate code is based on h5py.Dataset.__getitem__
        args = args if isinstance(args, tuple) else (args, )

        if new_dtype is None:
            new_dtype = getattr(self._local, 'astype', None)

        # Sort field names from the rest of the args.
        names = tuple(x for x in args if isinstance(x, str))

        if names:
            # Read a subset of the fields in this structured dtype
            if len(names) == 1:
                names = names[0]  # Read with simpler dtype of this field
            args = tuple(x for x in args if not isinstance(x, str))
            return self.fields(names, _prior_dtype=new_dtype)[args]

        if new_dtype is None:
            new_dtype = self.dtype
        mtype = h5t.py_create(new_dtype)

        # === Special-case region references ====

        if len(args) == 1 and isinstance(args[0], h5r.RegionReference):

            obj = h5r.dereference(args[0], self.id)
            if obj != self.id:
                raise ValueError("Region reference must point to this dataset")

            sid = h5r.get_region(args[0], self.id)
            mshape = guess_shape(sid)
            if mshape is None:
                # 0D with no data (NULL or deselected SCALAR)
                return Empty(new_dtype)
            out = np.empty(mshape, dtype=new_dtype)
            if out.size == 0:
                return out

            sid_out = h5s.create_simple(mshape)
            sid_out.select_all()
            self.id.read(sid_out, sid, out, mtype)
            return out

        # === END CODE FROM h5py.Dataset.__getitem__ ===

        idx = ndindex(args).reduce(self.shape)

        arr = np.ndarray(idx.newshape(self.shape), new_dtype, order='C')

        for c, index in as_subchunks(idx, self.shape, self.chunks):
            if isinstance(self.id.data_dict[c], (slice, Slice, tuple, Tuple)):
                raw_idx = Tuple(self.id.data_dict[c],
                                *[slice(0, len(i)) for i in c.args[1:]]).raw
                a = self.id._read_chunk(raw_idx)
                self.id.data_dict[c] = a

            if self.id.data_dict[c].size != 0:
                arr_idx = c.as_subindex(idx)
                arr[arr_idx.raw] = self.id.data_dict[c][index.raw]

        return arr
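The region-reference branch above mirrors what plain h5py exposes through Dataset.regionref. A small hedged sketch, with a hypothetical file and dataset name:

# Sketch of creating and dereferencing a region reference with the
# high-level h5py API ('regref.h5' and 'data' are hypothetical names).
import numpy as np
import h5py

with h5py.File('regref.h5', 'w') as f:
    dset = f.create_dataset('data', data=np.arange(100).reshape(10, 10))
    ref = dset.regionref[2:5, 3:7]   # reference to a 3x4 hyperslab
    block = dset[ref]                # read back through the reference
    assert block.shape == (3, 4)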
Example #28
def create_compact_dataset(loc, name, shape=None, dtype=None, data=None,
                           chunks=None, compression=None, shuffle=None,
                           fletcher32=None, maxshape=None,
                           compression_opts=None, fillvalue=None,
                           scaleoffset=None, track_times=None):
    """Create a new HDF5 dataset with a compact storage layout."""

    # Convert data to a C-contiguous ndarray
    if data is not None:
        import h5py._hl.base
        data = numpy.asarray(data, order="C", dtype=h5py._hl.base.guess_dtype(data))

    # Validate shape
    if shape is None:
        if data is None:
            raise TypeError("Either data or shape must be specified")
        shape = data.shape
    else:
        shape = tuple(shape)
        if data is not None and (numpy.prod(shape) != numpy.prod(data.shape)):
            raise ValueError("Shape tuple is incompatible with data")

    if isinstance(dtype, h5py.Datatype):
        # Named types are used as-is
        tid = dtype.id
        dtype = tid.dtype  # Following code needs this
    else:
        # Validate dtype
        if dtype is None and data is None:
            dtype = numpy.dtype("=f4")
        elif dtype is None and data is not None:
            dtype = data.dtype
        else:
            dtype = numpy.dtype(dtype)
        tid = h5t.py_create(dtype, logical=1)

    # Legacy
    if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False:
        raise ValueError("Chunked format required for given storage options")

    # Legacy
    if compression is True:
        if compression_opts is None:
            compression_opts = 4
        compression = 'gzip'

    # Legacy
    if compression in range(10):
        if compression_opts is not None:
            raise TypeError("Conflict in compression options")
        compression_opts = compression
        compression = 'gzip'

    if h5py.version.version_tuple >= (2, 2, 0, ''):
        dcpl = filters.generate_dcpl(shape, dtype, chunks, compression,
                                     compression_opts, shuffle, fletcher32,
                                     maxshape, None)
    else:
        dcpl = filters.generate_dcpl(shape, dtype, chunks, compression,
                                     compression_opts, shuffle, fletcher32,
                                     maxshape)

    if fillvalue is not None:
        fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    elif track_times is not None:
        raise TypeError("track_times must be either True or False")

    dcpl.set_layout(h5d.COMPACT)

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)
    sid = h5s.create_simple(shape, maxshape)

    dset_id = h5d.create(loc.id, None, tid, sid, dcpl=dcpl)

    if data is not None:
        dset_id.write(h5s.ALL, h5s.ALL, data)

    dset = dataset.Dataset(dset_id)
    if name is not None:
        loc[name] = dset
    return dset
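A brief usage sketch for create_compact_dataset, assuming the function above is in scope. Note that HDF5 stores compact data in the object header, which caps such datasets at 64 KiB.

# Usage sketch (assumes create_compact_dataset above is importable;
# file and dataset names are hypothetical).
import numpy
import h5py

with h5py.File('compact.h5', 'w') as f:
    dset = create_compact_dataset(f, 'small', data=numpy.arange(16))
    print(dset.shape, dset.dtype)   # (16,) and the platform default int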
Example #29
def make_new_dset(parent, shape=None, dtype=None, data=None,
                  chunks=None, compression=None, shuffle=None,
                  fletcher32=None, maxshape=None, compression_opts=None,
                  fillvalue=None, scaleoffset=None, track_times=None):
    """ Return a new low-level dataset identifier

    Only creates anonymous datasets.
    """

    # Convert data to a C-contiguous ndarray
    if data is not None:
        from h5py._hl import base
        data = numpy.asarray(data, order="C", dtype=base.guess_dtype(data))

    # Validate shape
    if shape is None:
        if data is None:
            raise TypeError("Either data or shape must be specified")
        shape = data.shape
    else:
        shape = tuple(shape)
        if data is not None and (numpy.prod(shape) != numpy.prod(data.shape)):
            raise ValueError("Shape tuple is incompatible with data")

    # Validate dtype
    if dtype is None and data is None:
        dtype = numpy.dtype("=f4")
    elif dtype is None and data is not None:
        dtype = data.dtype
    else:
        dtype = numpy.dtype(dtype)

    # Legacy
    if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False:
        raise ValueError("Chunked format required for given storage options")

    # Legacy
    if compression is True:
        if compression_opts is None:
            compression_opts = 4
        compression = 'gzip'

    # Legacy
    if compression in range(10):
        if compression_opts is not None:
            raise TypeError("Conflict in compression options")
        compression_opts = compression
        compression = 'gzip'

    dcpl = filters.generate_dcpl(shape, dtype, chunks, compression,
                                 compression_opts, shuffle, fletcher32,
                                 maxshape, scaleoffset)

    if fillvalue is not None:
        fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    elif track_times is not None:
        raise TypeError("track_times must be either True or False")

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)
    sid = h5s.create_simple(shape, maxshape)
    tid = h5t.py_create(dtype, logical=1)

    dset_id = h5d.create(parent.id, None, tid, sid, dcpl=dcpl)

    if data is not None:
        dset_id.write(h5s.ALL, h5s.ALL, data)

    return dset_id
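Because make_new_dset only creates anonymous datasets, the caller has to link the returned identifier into the file to give it a name. A minimal sketch, with hypothetical names:

# Sketch of linking the anonymous dataset id returned by make_new_dset
# ('anon.h5' and 'grid' are hypothetical names).
import numpy
import h5py

with h5py.File('anon.h5', 'w') as f:
    dsid = make_new_dset(f, data=numpy.zeros((4, 4), dtype='=f4'))
    f['grid'] = h5py.Dataset(dsid)  # hard-link the dataset under a name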
Example #30
    def __setitem__(self, args, val):
        """ Write to the HDF5 dataset from a Numpy array.

        NumPy's broadcasting rules are honored, for "simple" indexing
        (slices and integers).  For advanced indexing, the shapes must
        match.
        """
        args = args if isinstance(args, tuple) else (args, )

        # Sort field indices from the slicing
        names = tuple(x for x in args if isinstance(x, str))
        args = tuple(x for x in args if not isinstance(x, str))

        # Generally we try to avoid converting the arrays on the Python
        # side.  However, for compound literals this is unavoidable.
        vlen = h5t.check_dtype(vlen=self.dtype)
        if vlen not in (bytes, str, None):
            try:
                val = numpy.asarray(val, dtype=vlen)
            except ValueError:
                try:
                    val = numpy.array(
                        [numpy.array(x, dtype=vlen) for x in val],
                        dtype=self.dtype)
                except ValueError:
                    pass
            if vlen == val.dtype:
                if val.ndim > 1:
                    tmp = numpy.empty(shape=val.shape[:-1], dtype=object)
                    tmp.ravel()[:] = [
                        i for i in val.reshape((numpy.prod(val.shape[:-1]),
                                                val.shape[-1]))
                    ]
                else:
                    tmp = numpy.array([None], dtype=object)
                    tmp[0] = val
                val = tmp
        elif self.dtype.kind == "O" or \
          (self.dtype.kind == 'V' and \
          (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and \
          (self.dtype.subdtype is None)):
            if len(names) == 1 and self.dtype.fields is not None:
                # Single field selected for write, from a non-array source
                if not names[0] in self.dtype.fields:
                    raise ValueError("No such field for indexing: %s" %
                                     names[0])
                dtype = self.dtype.fields[names[0]][0]
                cast_compound = True
            else:
                dtype = self.dtype
                cast_compound = False

            val = numpy.asarray(val, dtype=dtype, order='C')
            if cast_compound:
                val = val.astype(numpy.dtype([(names[0], dtype)]))
        else:
            val = numpy.asarray(val, order='C')

        # Check for array dtype compatibility and convert
        if self.dtype.subdtype is not None:
            shp = self.dtype.subdtype[1]
            valshp = val.shape[-len(shp):]
            if valshp != shp:  # Last dimension has to match
                raise TypeError(
                    "When writing to array types, last N dimensions have to match (got %s, but should be %s)"
                    % (
                        valshp,
                        shp,
                    ))
            mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
            mshape = val.shape[0:len(val.shape) - len(shp)]

        # Make a compound memory type if field-name slicing is required
        elif len(names) != 0:

            mshape = val.shape

            # Catch common errors
            if self.dtype.fields is None:
                raise TypeError(
                    "Illegal slicing argument (not a compound dataset)")
            mismatch = [x for x in names if x not in self.dtype.fields]
            if len(mismatch) != 0:
                mismatch = ", ".join('"%s"' % x for x in mismatch)
                raise ValueError(
                    "Illegal slicing argument (fields %s not in dataset type)"
                    % mismatch)

            # Write non-compound source into a single dataset field
            if len(names) == 1 and val.dtype.fields is None:
                subtype = h5t.py_create(val.dtype)
                mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
                mtype.insert(self._e(names[0]), 0, subtype)

            # Make a new source type keeping only the requested fields
            else:
                fieldnames = [x for x in val.dtype.names
                              if x in names]  # Keep source order
                mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
                for fieldname in fieldnames:
                    subtype = h5t.py_create(val.dtype.fields[fieldname][0])
                    offset = val.dtype.fields[fieldname][1]
                    mtype.insert(self._e(fieldname), offset, subtype)

        # Use mtype derived from array (let DatasetID.write figure it out)
        else:
            mshape = val.shape
            mtype = None

        # Perform the dataspace selection
        selection = sel.select(self.shape, args, dsid=self.id)

        if selection.nselect == 0:
            return

        # Broadcast scalars if necessary.
        if (mshape == () and selection.mshape != ()):
            if self.dtype.subdtype is not None:
                raise TypeError(
                    "Scalar broadcasting is not supported for array dtypes")
            val2 = numpy.empty(selection.mshape[-1], dtype=val.dtype)
            val2[...] = val
            val = val2
            mshape = val.shape

        # Perform the write, with broadcasting
        # Be careful to pad memory shape with ones to avoid HDF5 chunking
        # glitch, which kicks in for mismatched memory/file selections
        if (len(mshape) < len(self.shape)):
            mshape_pad = (1, ) * (len(self.shape) - len(mshape)) + mshape
        else:
            mshape_pad = mshape
        mspace = h5s.create_simple(mshape_pad,
                                   (h5s.UNLIMITED, ) * len(mshape_pad))
        for fspace in selection.broadcast(mshape):
            self.id.write(mspace, fspace, val, mtype)
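The vlen branch above handles datasets whose elements are themselves arrays. For contrast, a short sketch of what such writes look like through the high-level API (file and dataset names hypothetical):

# Sketch of variable-length writes ('vlen.h5' and 'ragged' are
# hypothetical names).
import numpy
import h5py

vlen_int = h5py.special_dtype(vlen=numpy.dtype('int32'))
with h5py.File('vlen.h5', 'w') as f:
    dset = f.create_dataset('ragged', (3,), dtype=vlen_int)
    dset[0] = [1, 2, 3]                       # rows can differ in length
    dset[1] = [4]
    dset[2] = numpy.arange(5, dtype='int32')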
Example #31
def _calc_ci_block(block_label, assignments_filename, kinetics_filename,
                   istate, jstate, start_iter, stop_iter, mcbs_alpha,
                   mcbs_acalpha, mcbs_nsets, extrapolate):
    log.debug('istate={} jstate={} start_iter={} stop_iter={}'.format(
        istate, jstate, start_iter, stop_iter))
    assignments_file = h5py.File(assignments_filename, 'r')
    kinetics_file = h5py.File(kinetics_filename, 'r')

    nstates = assignments_file.attrs['nstates']
    nbins = assignments_file.attrs['nbins']
    niters = stop_iter - start_iter

    # Fluxes and populations are averaged as they are read, as these are generally
    # very large datasets
    avg_fluxes = numpy.zeros((nstates, nstates, nbins, nbins), weight_dtype)
    avg_pops = numpy.zeros((nstates, nbins), weight_dtype)

    # Per-iteration macrostate-macrostate fluxes, for correlation calculation
    macro_fluxes = numpy.empty((niters, nstates, nstates), weight_dtype)

    # Source datasets
    pops_ds = assignments_file['labeled_populations']
    fluxes_ds = kinetics_file['labeled_bin_fluxes']
    pops_iter_start = pops_ds.attrs.get('iter_start', 1)
    fluxes_iter_start = fluxes_ds.attrs.get('iter_start', 1)

    # prepend 1 so that rank of dest == rank of src
    labeled_fluxes = numpy.empty((1, nstates, nstates, nbins, nbins),
                                 weight_dtype)
    labeled_pops = numpy.empty((1, nstates, nbins), weight_dtype)

    lflux_memsel = h5s.create_simple(labeled_fluxes.shape,
                                     (h5s.UNLIMITED, ) * labeled_fluxes.ndim)
    lpop_memsel = h5s.create_simple(labeled_pops.shape,
                                    (h5s.UNLIMITED, ) * labeled_pops.ndim)

    fluxes_dsid = fluxes_ds.id
    pops_dsid = pops_ds.id

    lflux_filesel = fluxes_dsid.get_space()
    lpop_filesel = pops_dsid.get_space()

    # Overall average
    for iiter, n_iter in enumerate(range(start_iter, stop_iter)):
        lflux_filesel.select_hyperslab(
            (n_iter - fluxes_iter_start, 0, 0, 0, 0),
            (1, nstates, nstates, nbins, nbins),
            op=h5s.SELECT_SET)
        lpop_filesel.select_hyperslab((n_iter - pops_iter_start, 0, 0),
                                      (1, nstates, nbins),
                                      op=h5s.SELECT_SET)
        fluxes_dsid.read(lflux_memsel, lflux_filesel, labeled_fluxes)
        pops_dsid.read(lpop_memsel, lpop_filesel, labeled_pops)
        avg_fluxes += labeled_fluxes[0]
        avg_pops += labeled_pops[0]
        macro_fluxes[iiter] = labeled_fluxes[0].sum(axis=3).sum(axis=2)

    avg_fluxes /= niters
    avg_pops /= niters
    avg_rates = labeled_flux_to_rate(avg_fluxes, avg_pops)
    ss, macro_rates = get_macrostate_rates(avg_rates, avg_pops, extrapolate)
    overall_avg_rates = macro_rates.copy()
    ctime = mcbs_correltime(macro_fluxes[:, istate, jstate], mcbs_acalpha,
                            mcbs_nsets)

    # bootstrap
    lbi = int(math.floor(mcbs_nsets * mcbs_alpha / 2.0))
    ubi = int(math.ceil(mcbs_nsets * (1 - mcbs_alpha / 2.0)))
    stride = ctime + 1
    synth_rates = numpy.empty((mcbs_nsets, ), weight_dtype)

    starts = numpy.arange(start_iter, stop_iter, stride, dtype=numpy.uintc)
    stops = numpy.arange(start_iter + stride,
                         stop_iter + stride,
                         stride,
                         dtype=numpy.uintc)
    nblocks = len(starts)
    if stops[-1] > stop_iter:
        stops[-1] = stop_iter

    for iset in range(mcbs_nsets):
        avg_fluxes.fill(0)
        avg_pops.fill(0)
        iters_averaged = 0
        log.debug('iset={} istate={} jstate={}'.format(iset, istate, jstate))

        for _block in range(nblocks):
            iblock = random.randint(0, nblocks - 1)
            for n_iter in range(starts[iblock], stops[iblock]):
                iters_averaged += 1

                lflux_filesel.select_hyperslab(
                    (n_iter - fluxes_iter_start, 0, 0, 0, 0),
                    (1, nstates, nstates, nbins, nbins),
                    op=h5s.SELECT_SET)
                lpop_filesel.select_hyperslab((n_iter - pops_iter_start, 0, 0),
                                              (1, nstates, nbins),
                                              op=h5s.SELECT_SET)
                fluxes_dsid.read(lflux_memsel, lflux_filesel, labeled_fluxes)
                pops_dsid.read(lpop_memsel, lpop_filesel, labeled_pops)
                avg_fluxes += labeled_fluxes[0]
                avg_pops += labeled_pops[0]

        avg_fluxes /= iters_averaged
        avg_pops /= iters_averaged
        avg_rates = labeled_flux_to_rate(avg_fluxes, avg_pops)
        ss, macro_rates = get_macrostate_rates(avg_rates, avg_pops,
                                               extrapolate)
        synth_rates[iset] = macro_rates[istate, jstate]
    synth_rates.sort()

    return (block_label, istate, jstate,
            (start_iter, stop_iter, overall_avg_rates[istate, jstate],
             synth_rates[lbi], synth_rates[ubi], ctime))
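Stripped of the HDF5 plumbing, the resampling logic above is a standard block bootstrap: draw blocks of consecutive iterations with replacement, recompute the estimate for each synthetic set, and take the confidence bounds from the sorted estimates, just as the lbi/ubi indices do. A self-contained sketch on a plain 1-D series (all names and the mean() statistic are illustrative):

# Self-contained block-bootstrap sketch; function name, defaults, and
# the mean() statistic are illustrative assumptions.
import math
import random
import numpy

def block_bootstrap_ci(series, nsets=1000, alpha=0.05, blocksize=1):
    n = len(series)
    starts = numpy.arange(0, n, blocksize)
    synth = numpy.empty(nsets)
    for iset in range(nsets):
        picks = []
        for _ in range(len(starts)):
            b = random.randint(0, len(starts) - 1)
            picks.append(series[starts[b]:starts[b] + blocksize])
        synth[iset] = numpy.concatenate(picks).mean()
    synth.sort()
    lbi = int(math.floor(nsets * alpha / 2.0))
    ubi = min(int(math.ceil(nsets * (1.0 - alpha / 2.0))), nsets - 1)
    return synth[lbi], synth[ubi]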
Example #32
    def __getitem__(self, args):
        """ Read a slice from the HDF5 dataset.

        Takes slices and recarray-style field names (more than one is
        allowed!) in any order.  Obeys basic NumPy rules, including
        broadcasting.

        Also supports:

        * Boolean "mask" array indexing
        """
        args = args if isinstance(args, tuple) else (args,)

        # Sort field indices from the rest of the args.
        names = tuple(x for x in args if isinstance(x, str))
        args = tuple(x for x in args if not isinstance(x, str))

        def strip_fields(basetype):
            """ Strip extra dtype information from special types """
            if basetype.kind == 'O':
                return numpy.dtype('O')
            if basetype.fields is not None:
                if basetype.kind in ('i', 'u'):
                    return basetype.fields['enum'][0]
                fields = []
                for name in basetype.names:
                    fff = basetype.fields[name]
                    if len(fff) == 3:
                        (subtype, offset, meta) = fff
                    else:
                        subtype, meta = fff
                        offset = 0
                    subtype = strip_fields(subtype)
                    fields.append((name, subtype))
                return numpy.dtype(fields)
            return basetype

        def readtime_dtype(basetype, names):
            """ Make a NumPy dtype appropriate for reading """

            basetype = strip_fields(basetype)

            if len(names) == 0:  # Not compound, or we want all fields
                return basetype

            if basetype.names is None:  # Names provided, but not compound
                raise ValueError("Field names only allowed for compound types")

            for name in names:  # Check all names are legal
                if name not in basetype.names:
                    raise ValueError("Field %s does not appear in this type." % name)

            return numpy.dtype([(name, basetype.fields[name][0]) for name in names])

        # This is necessary because in the case of array types, NumPy
        # discards the array information at the top level.
        new_dtype = readtime_dtype(self.id.dtype, names)
        mtype = h5t.py_create(new_dtype)

        # === Scalar dataspaces =================

        if self.shape == ():
            fspace = self.id.get_space()
            selection = sel2.select_read(fspace, args)
            arr = numpy.ndarray(selection.mshape, dtype=new_dtype)
            for mspace, fspace in selection:
                self.id.read(mspace, fspace, arr, mtype)
            if selection.mshape is None:
                return arr[()]
            return arr

        # === Everything else ===================

        # Perform the dataspace selection.
        selection = sel.select(self.shape, args, dsid=self.id)

        if selection.nselect == 0:
            return numpy.ndarray((0,), dtype=new_dtype)

        # Up-converting to (1,) so that numpy.ndarray correctly creates
        # np.void rows in case of multi-field dtype. (issue 135)
        single_element = selection.mshape == ()
        mshape = (1,) if single_element else selection.mshape
        arr = numpy.ndarray(mshape, new_dtype, order='C')

        # HDF5 has a bug where if the memory shape has a different rank
        # than the dataset, the read is very slow
        if len(mshape) < len(self.shape):
            # pad with ones
            mshape = (1,)*(len(self.shape)-len(mshape)) + mshape

        # Perfom the actual read
        mspace = h5s.create_simple(mshape)
        fspace = selection._id
        self.id.read(mspace, fspace, arr, mtype)

        # Patch up the output for NumPy
        if len(names) == 1:
            arr = arr[names[0]]     # Single-field recarray convention
        if arr.shape == ():
            arr = arr.item()  # numpy.asscalar() was removed in NumPy 1.23
        if single_element:
            arr = arr[0]
        return arr
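A short sketch of the recarray-style field selection this __getitem__ implements, against a hypothetical compound dataset:

# Sketch of field-name reads ('fields.h5' and the dtype are hypothetical).
import numpy
import h5py

comp = numpy.dtype([('coord', 'f8', (3,)), ('energy', 'f8')])
with h5py.File('fields.h5', 'w') as f:
    dset = f.create_dataset('frames', (10,), dtype=comp)
    energies = dset['energy']        # single field -> plain float array
    pair = dset['coord', 'energy']   # several fields -> compound subset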
Example #33
    def __setitem__(self, args, val):
        """ Write to the HDF5 dataset from a Numpy array.

        NumPy's broadcasting rules are honored, for "simple" indexing
        (slices and integers).  For advanced indexing, the shapes must
        match.
        """
        args = args if isinstance(args, tuple) else (args,)

        # Sort field indices from the slicing
        names = tuple(x for x in args if isinstance(x, str))
        args = tuple(x for x in args if not isinstance(x, str))

        # Generally we try to avoid converting the arrays on the Python
        # side.  However, for compound literals this is unavoidable.
        if self.dtype.kind == "O" or \
          (self.dtype.kind == 'V' and \
          (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and \
          (self.dtype.subdtype is None)):
            if len(names) == 1 and self.dtype.fields is not None:
                # Single field selected for write, from a non-array source
                if not names[0] in self.dtype.fields:
                    raise ValueError("No such field for indexing: %s" % names[0])
                dtype = self.dtype.fields[names[0]][0]
                cast_compound = True
            else:
                dtype = self.dtype
                cast_compound = False

            val = numpy.asarray(val, dtype=dtype, order='C')
            if cast_compound:
                val = val.astype(numpy.dtype([(names[0], dtype)]))
        else:
            val = numpy.asarray(val, order='C')

        # Check for array dtype compatibility and convert
        if self.dtype.subdtype is not None:
            shp = self.dtype.subdtype[1]
            valshp = val.shape[-len(shp):]
            if valshp != shp:  # Last dimension has to match
                raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
            mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
            mshape = val.shape[0:len(val.shape)-len(shp)]

        # Make a compound memory type if field-name slicing is required
        elif len(names) != 0:

            mshape = val.shape

            # Catch common errors
            if self.dtype.fields is None:
                raise TypeError("Illegal slicing argument (not a compound dataset)")
            mismatch = [x for x in names if x not in self.dtype.fields]
            if len(mismatch) != 0:
                mismatch = ", ".join('"%s"'%x for x in mismatch)
                raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch)
        
            # Write non-compound source into a single dataset field
            if len(names) == 1 and val.dtype.fields is None:
                subtype = h5y.py_create(val.dtype)
                mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
                mtype.insert(self._e(names[0]), 0, subtype)

            # Make a new source type keeping only the requested fields
            else:
                fieldnames = [x for x in val.dtype.names if x in names] # Keep source order
                mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
                for fieldname in fieldnames:
                    subtype = h5t.py_create(val.dtype.fields[fieldname][0])
                    offset = val.dtype.fields[fieldname][1]
                    mtype.insert(self._e(fieldname), offset, subtype)

        # Use mtype derived from array (let DatasetID.write figure it out)
        else:
            mshape = val.shape
            mtype = None

        # Perform the dataspace selection
        selection = sel.select(self.shape, args, dsid=self.id)

        if selection.nselect == 0:
            return

        # Broadcast scalars if necessary.
        if (mshape == () and selection.mshape != ()):
            if self.dtype.subdtype is not None:
                raise TypeError("Scalar broadcasting is not supported for array dtypes")
            val2 = numpy.empty(selection.mshape[-1], dtype=val.dtype)
            val2[...] = val
            val = val2
            mshape = val.shape

        # Perform the write, with broadcasting
        # Be careful to pad memory shape with ones to avoid HDF5 chunking
        # glitch, which kicks in for mismatched memory/file selections
        if(len(mshape) < len(self.shape)):
            mshape_pad = (1,)*(len(self.shape)-len(mshape)) + mshape
        else:
            mshape_pad = mshape
        mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,)*len(mshape_pad))
        for fspace in selection.broadcast(mshape):
            self.id.write(mspace, fspace, val, mtype)
Example #34
    def __setitem__(self, args, val):
        """ Write to the HDF5 dataset from a Numpy array.

        NumPy's broadcasting rules are honored, for "simple" indexing
        (slices and integers).  For advanced indexing, the shapes must
        match.
        """
        args = args if isinstance(args, tuple) else (args,)

        # Sort field indices from the slicing
        names = tuple(x for x in args if isinstance(x, str))
        args = tuple(x for x in args if not isinstance(x, str))

        if len(names) != 0:
            raise TypeError("Field name selections are not allowed for write.")

        # Generally we try to avoid converting the arrays on the Python
        # side.  However, for compound literals this is unavoidable.
        if self.dtype.kind == "O" or \
          (self.dtype.kind == 'V' and \
          (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and \
          (self.dtype.subdtype is None)):
            val = numpy.asarray(val, dtype=self.dtype, order='C')
        else:
            val = numpy.asarray(val, order='C')

        # Check for array dtype compatibility and convert
        if self.dtype.subdtype is not None:
            shp = self.dtype.subdtype[1]
            valshp = val.shape[-len(shp):]
            if valshp != shp:  # Last dimension has to match
                raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
            mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
            mshape = val.shape[0:len(val.shape)-len(shp)]
        else:
            mshape = val.shape
            mtype = None

        # Perform the dataspace selection
        selection = sel.select(self.shape, args, dsid=self.id)

        if selection.nselect == 0:
            return

        # Broadcast scalars if necessary.
        if (mshape == () and selection.mshape != ()):
            if self.dtype.subdtype is not None:
                raise TypeError("Scalar broadcasting is not supported for array dtypes")
            val2 = numpy.empty(selection.mshape[-1], dtype=val.dtype)
            val2[...] = val
            val = val2
            mshape = val.shape

        # Perform the write, with broadcasting
        # Be careful to pad memory shape with ones to avoid HDF5 chunking
        # glitch, which kicks in for mismatched memory/file selections
        if(len(mshape) < len(self.shape)):
            mshape_pad = (1,)*(len(self.shape)-len(mshape)) + mshape
        else:
            mshape_pad = mshape
        mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,)*len(mshape_pad))
        for fspace in selection.broadcast(mshape):
            self.id.write(mspace, fspace, val, mtype)
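The scalar-broadcast branch above is what makes writes like the following work; a minimal sketch with hypothetical names:

# Sketch of scalar broadcasting on write ('bcast.h5'/'grid' hypothetical).
import h5py

with h5py.File('bcast.h5', 'w') as f:
    dset = f.create_dataset('grid', (4, 5), dtype='f8')
    dset[1, :] = 7.0   # one scalar expanded across a row selection
    dset[...] = 0.0    # and across the whole dataset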
Example #35
#RMf = h5py.File(fid)
RMf = h5py.File(RMfn, 'w', driver='mpio', comm=comm)
RMf.atomic = True
# Table for RMSD
RM = RMf.create_dataset(
    'rmsd',
    (N, N),
    dtype=np.float64,
    chunks=(l, l))
RM.attrs['chunk'] = l
RMs = RM.id.get_space()


# Init calculations
tS = np.zeros((l, l), dtype=np.float64)
ms = h5s.create_simple((l, l))

i, j = rank, rank
ic = S[i * l: (i + 1) * l]
jc = ic

for c in range(m):
    if rank == 0:
        tit = time.time()

    try:
        assert i == j
        calc_diag_chunk(ic, tS)
    except AssertionError:
        calc_chunk(ic, jc, tS)