    def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus, blocksize,
                 buffer_size=None, timer=nulltimer):
        BlacsLayouts.__init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
                              blocksize, timer)
        self.buffer_size = buffer_size
        nbands = bd.nbands
        self.mynbands = mynbands = bd.mynbands
        self.blocksize = blocksize

        # 1D layout - columns
        self.columngrid = BlacsGrid(self.column_comm, 1, bd.comm.size)
        self.Nndescriptor = self.columngrid.new_descriptor(nbands, nbands,
                                                           nbands, mynbands)

        # 2D layout
        self.nndescriptor = self.blockgrid.new_descriptor(nbands, nbands,
                                                          blocksize, blocksize)

        # 1D layout - rows
        self.rowgrid = BlacsGrid(self.column_comm, bd.comm.size, 1)
        self.nNdescriptor = self.rowgrid.new_descriptor(nbands, nbands,
                                                        mynbands, nbands)

        # Only redistribute the filled half for Hermitian matrices.
        self.Nn2nn = Redistributor(self.block_comm, self.Nndescriptor,
                                   self.nndescriptor)
        #self.Nn2nn = Redistributor(self.block_comm, self.Nndescriptor,
        #                           self.nndescriptor, 'L')  # XXX faster but...

        # The resulting matrix will be used in dgemm, which is
        # symmetry oblivious.
        self.nn2nN = Redistributor(self.block_comm, self.nndescriptor,
                                   self.nNdescriptor)
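    # Illustration of the layouts above (a sketch only; the real data
    # movement is done by the Redistributor objects).  In descriptor
    # names, a capital letter denotes a dimension each rank stores in
    # full and a lowercase letter a distributed one: Nn is a column
    # stripe (all rows, mynbands columns per band rank), nN is a row
    # stripe, and nn is block-cyclic on the 2D grid.  The hypothetical
    # helper below shows which slice of a global nbands-by-nbands
    # array a given band rank would own in the Nn layout:
    def _column_stripe_sketch(self, A_NN, band_rank):
        n1 = band_rank * self.mynbands
        return A_NN[:, n1:n1 + self.mynbands]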
class BlacsOrbitalLayouts(BlacsLayouts):
    """ScaLAPACK dense linear algebra for LCAO.

    This class is instantiated in LCAO.  Not for casual use, at least
    for now.

    Requires two distributors and three descriptors for initialization
    as well as grid and band descriptors.  The distributors are
    cols2blocks (1D -> 2D BLACS grid) and blocks2cols (2D -> 1D BLACS
    grid).  ScaLAPACK operations must occur on the 2D BLACS grid for
    performance and scalability.

    _general_diagonalize is "hard-coded" for LCAO: both the Hamiltonian
    and the overlap matrix are expected to be on the 2D BLACS grid
    already, having been placed there early on to save memory."""
    # This class 'describes' all the LCAO BLACS-related layouts.

    def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus, blocksize,
                 nao, timer=nulltimer):
        BlacsLayouts.__init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
                              blocksize, timer)
        nbands = bd.nbands
        self.blocksize = blocksize
        self.mynbands = mynbands = bd.mynbands
        self.orbital_comm = self.bd.comm
        # Ceiling division: the last rank may hold fewer than
        # naoblocksize basis functions.
        self.naoblocksize = naoblocksize = -((-nao) // self.orbital_comm.size)
        self.nao = nao

        # Range of basis functions for BLACS distribution of matrices:
        self.Mmax = nao
        self.Mstart = bd.comm.rank * naoblocksize
        self.Mstop = min(self.Mstart + naoblocksize, self.Mmax)
        self.mynao = self.Mstop - self.Mstart

        # Column layout for one matrix per band rank:
        self.columngrid = BlacsGrid(bd.comm, bd.comm.size, 1)
        self.mMdescriptor = self.columngrid.new_descriptor(nao, nao,
                                                           naoblocksize, nao)
        self.nMdescriptor = self.columngrid.new_descriptor(nbands, nao,
                                                           mynbands, nao)

        # Column layout for one matrix in total (only on grid masters):
        self.single_column_grid = BlacsGrid(self.column_comm, bd.comm.size, 1)
        self.mM_unique_descriptor = self.single_column_grid.new_descriptor(
            nao, nao, naoblocksize, nao)

        # nM_unique_descriptor is meant to hold the coefficients after
        # diagonalization.  BLACS requires it to be nao-by-nao, but
        # we only fill meaningful data into the first nbands columns.
        #
        # The array will then be trimmed and broadcast across
        # the grid descriptor's communicator.
        self.nM_unique_descriptor = self.single_column_grid.new_descriptor(
            nbands, nao, mynbands, nao)

        # Fully blocked grid for diagonalization with many CPUs:
        self.mmdescriptor = self.blockgrid.new_descriptor(nao, nao, blocksize,
                                                          blocksize)

        self.mM2mm = Redistributor(self.block_comm, self.mM_unique_descriptor,
                                   self.mmdescriptor)
        self.mm2nM = Redistributor(self.block_comm, self.mmdescriptor,
                                   self.nM_unique_descriptor)

    def diagonalize(self, H_mm, C_nM, eps_n, S_mm):
        # C_nM needs to be simultaneously compatible with:
        # 1. outdescriptor
        # 2. broadcast with gd.comm
        # We will do this with a dummy buffer C2_nM.
        indescriptor = self.mM2mm.srcdescriptor  # cols2blocks
        outdescriptor = self.mm2nM.dstdescriptor  # blocks2cols
        blockdescriptor = self.mM2mm.dstdescriptor  # cols2blocks

        dtype = S_mm.dtype
        eps_M = np.empty(C_nM.shape[-1])  # empty helps us debug
        subM, subN = outdescriptor.gshape

        C_mm = blockdescriptor.zeros(dtype=dtype)
        self.timer.start('General diagonalize')
        # general_diagonalize_ex may have a buffer overflow, so
        # we no longer use it.
        #blockdescriptor.general_diagonalize_ex(H_mm, S_mm.copy(), C_mm,
        #                                       eps_M, UL='L',
        #                                       iu=self.bd.nbands)
        blockdescriptor.general_diagonalize_dc(H_mm, S_mm.copy(), C_mm, eps_M,
                                               UL='L')
        self.timer.stop('General diagonalize')

        # Make C_nM compatible with the redistributor.
        self.timer.start('Redistribute coefs')
        if outdescriptor:
            C2_nM = C_nM
        else:
            C2_nM = outdescriptor.empty(dtype=dtype)
        assert outdescriptor.check(C2_nM)
        self.mm2nM.redistribute(C_mm, C2_nM, subM, subN)  # blocks2cols
        self.timer.stop('Redistribute coefs')

        self.timer.start('Send coefs to domains')
        # eps_M is already on block_comm.rank = 0.  It is easier to
        # broadcast eps_M to all ranks and take the correct slice
        # afterward.
        self.block_comm.broadcast(eps_M, 0)
        eps_n[:] = eps_M[self.bd.get_slice()]
        self.gd.comm.broadcast(C_nM, 0)
        self.timer.stop('Send coefs to domains')
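    # Serial reference for diagonalize() (a minimal sketch, assuming
    # scipy is available; the parallel path above uses ScaLAPACK's
    # divide-and-conquer driver on the 2D grid).  It solves the
    # generalized eigenproblem H C = S C eps using the lower triangles,
    # like general_diagonalize_dc with UL='L':
    def _serial_diagonalize_sketch(self, H_MM, S_MM, nbands):
        from scipy.linalg import eigh
        eps_M, V_MM = eigh(H_MM, S_MM, lower=True)
        # Rows of the returned array correspond to the lowest nbands
        # states (the exact conjugation convention depends on the
        # caller; sketch only).
        return eps_M[:nbands], V_MM[:, :nbands].T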
    def distribute_overlap_matrix(self, S_qmM, root=0,
                                  add_hermitian_conjugate=False):
        # Some MPI implementations need a lot of memory to do large
        # reductions.  To avoid trouble, we do comm.sum on smaller
        # blocks of S (this code is also safe for arrays smaller than
        # blocksize).
        Sflat_x = S_qmM.ravel()
        blocksize = 2**23 // Sflat_x.itemsize  # 8 MiB
        nblocks = -(-len(Sflat_x) // blocksize)
        Mstart = 0
        for i in range(nblocks):
            self.gd.comm.sum(Sflat_x[Mstart:Mstart + blocksize], root=root)
            Mstart += blocksize
        assert Mstart + blocksize >= len(Sflat_x)

        xshape = S_qmM.shape[:-2]
        nm, nM = S_qmM.shape[-2:]
        S_qmM = S_qmM.reshape(-1, nm, nM)

        blockdesc = self.mmdescriptor
        coldesc = self.mM_unique_descriptor
        S_qmm = blockdesc.zeros(len(S_qmM), S_qmM.dtype)

        if not coldesc:  # XXX ugly way to sort out inactive ranks
            S_qmM = coldesc.zeros(len(S_qmM), S_qmM.dtype)

        self.timer.start('Distribute overlap matrix')
        for S_mM, S_mm in zip(S_qmM, S_qmm):
            self.mM2mm.redistribute(S_mM, S_mm)
            if add_hermitian_conjugate:
                if blockdesc.active:
                    pblas_tran(1.0, S_mm.copy(), 1.0, S_mm,
                               blockdesc, blockdesc)
        self.timer.stop('Distribute overlap matrix')
        return S_qmm.reshape(xshape + blockdesc.shape)

    def get_overlap_matrix_shape(self):
        return self.mmdescriptor.shape

    def calculate_blocked_density_matrix(self, f_n, C_nM):
        nbands = self.bd.nbands
        mynbands = self.bd.mynbands
        nao = self.nao
        dtype = C_nM.dtype

        self.nMdescriptor.checkassert(C_nM)
        if self.gd.rank == 0:
            Cf_nM = (C_nM * f_n[:, None]).conj()
        else:
            C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
            Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)

        r = Redistributor(self.block_comm, self.nM_unique_descriptor,
                          self.mmdescriptor)

        Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(Cf_nM, Cf_mm, nbands, nao)
        del Cf_nM

        C_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(C_nM, C_mm, nbands, nao)
        # No point in deleting C_nM; it is owned by the caller.

        rho_mm = self.mmdescriptor.zeros(dtype=dtype)
        pblas_simple_gemm(self.mmdescriptor, self.mmdescriptor,
                          self.mmdescriptor, Cf_mm, C_mm, rho_mm, transa='T')
        return rho_mm
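    # Reference for the GEMM in calculate_blocked_density_matrix (a
    # serial numpy sketch, not used by the parallel path): with
    # Cf = (C * f).conj(), the transa='T' product computes
    # rho_MM' = sum_n f_n conj(C_nM) C_nM'.
    def _serial_density_matrix_sketch(self, f_n, C_nM):
        return np.dot((C_nM * f_n[:, None]).conj().T, C_nM)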
    def calculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        """Calculate the density matrix from occupations and coefficients.

        Presently this function performs the usual ScaLAPACK three-step
        trick: redistribute, number-crunch, redistribute back.

        Notes on future performance improvements.

        As per the current framework, C_nM exists as copies on each
        domain, i.e. this is not parallel over domains.  We'd like to
        correct this and have an efficient distribution using e.g. the
        block communicator.

        The diagonalization routine and other parts of the code should
        however be changed to accommodate the following scheme:

        Keep the coefficients in C_mm form after the diagonalization.
        rho_mm can then be calculated directly from C_mm without
        redistribution, after which only rho_mm needs to be
        redistributed across domains."""
        dtype = C_nM.dtype
        rho_mm = self.calculate_blocked_density_matrix(f_n, C_nM)
        rback = Redistributor(self.block_comm, self.mmdescriptor,
                              self.mM_unique_descriptor)
        rho1_mM = self.mM_unique_descriptor.zeros(dtype=dtype)
        rback.redistribute(rho_mm, rho1_mM)
        del rho_mm

        if rho_mM is None:
            if self.gd.rank == 0:
                rho_mM = rho1_mM
            else:
                rho_mM = self.mMdescriptor.zeros(dtype=dtype)

        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM

    def distribute_to_columns(self, rho_mm, srcdescriptor):
        redistributor = Redistributor(self.block_comm,  # XXX
                                      srcdescriptor,
                                      self.mM_unique_descriptor)
        rho_mM = redistributor.redistribute(rho_mm)
        if self.gd.rank != 0:
            rho_mM = self.mMdescriptor.zeros(dtype=rho_mm.dtype)
        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM

    def oldcalculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        # This version is parallel over the band descriptor only.
        # It is inefficient, but we keep it around for a while in
        # case there is trouble with the more efficient version.
        nbands = self.bd.nbands
        mynbands = self.bd.mynbands
        nao = self.nao

        if rho_mM is None:
            rho_mM = self.mMdescriptor.zeros(dtype=C_nM.dtype)

        Cf_nM = (C_nM * f_n[:, None]).conj()
        pblas_simple_gemm(self.nMdescriptor, self.nMdescriptor,
                          self.mMdescriptor, Cf_nM, C_nM, rho_mM, transa='T')
        return rho_mM

    def get_transposed_density_matrix(self, f_n, C_nM, rho_mM=None):
        return self.calculate_density_matrix(f_n, C_nM, rho_mM).conj()

    def get_description(self):
        (title, template) = BlacsLayouts.get_description(self)
        bg = self.blockgrid
        desc = self.mmdescriptor
        s = template % (bg.nprow, bg.npcol, desc.mb, desc.nb)
        return ' '.join([title, s])
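# Standalone sketch of the chunked in-place reduction used in
# distribute_overlap_matrix (a hypothetical helper, assuming a
# GPAW-style communicator whose sum() reduces a contiguous array in
# place on root).  Summing in 8 MiB chunks keeps MPI buffer usage
# bounded for large overlap matrices:
def _chunked_sum_sketch(comm, A_x, root=0):
    flat_x = A_x.ravel()
    blocksize = 2**23 // flat_x.itemsize  # 8 MiB per chunk
    nblocks = -(-len(flat_x) // blocksize)  # ceil division
    for i in range(nblocks):
        comm.sum(flat_x[i * blocksize:(i + 1) * blocksize], root=root)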