def scalapack_diagonalize(self, H_sS):
    """Diagonalize the distributed matrix ``H_sS`` with ScaLAPACK.

    ``H_sS`` is described on a ``size x 1`` process grid with blocks of
    ``self.nS_local`` rows by ``self.nS`` columns.  It is copied onto a
    ``(size // 2) x 2`` grid with square 32 x 32 blocks, diagonalized
    there with ``diagonalize_dc`` (``'L'`` is passed as the triangle
    selector) and the eigenvectors are copied back to the input layout.

    Returns
    -------
    (w_S, v_sS) : eigenvalues as a float array of length ``self.nS`` and
        the complex conjugate of the eigenvector array, which has the
        same shape and dtype as ``H_sS``.
    """
    block = 32
    dim = self.nS

    # Grid construction order is kept as in the original code.
    row_grid = BlacsGrid(world, size, 1)
    square_grid = BlacsGrid(world, size // 2, 2)
    row_desc = row_grid.new_descriptor(dim, dim, self.nS_local, dim)
    square_desc = square_grid.new_descriptor(dim, dim, block, block)

    # Input layout -> block layout used for the diagonalization.
    A_ss = square_desc.empty(dtype=H_sS.dtype)
    Redistributor(world, row_desc, square_desc).redistribute(H_sS, A_ss)

    # Diagonalize on the square-block layout.
    v_ss = square_desc.zeros(dtype=A_ss.dtype)
    w_S = np.zeros(dim, dtype=float)
    square_desc.diagonalize_dc(A_ss, v_ss, w_S, 'L')

    # Eigenvectors back to the layout of the input; they are not
    # gathered into one full matrix here.
    v_sS = np.zeros_like(H_sS)
    Redistributor(world, square_desc, row_desc).redistribute(v_ss, v_sS)
    return w_S, v_sS.conj()
def __init__(self, sl_lrtddft, nrows, lr_comms):
    """Create the BLACS grids, descriptors and redistributors.

    Parameters
    ----------
    sl_lrtddft : iterable of three items, unpacked as
        ``(mprocs, nprocs, block_size)``.
    nrows : dimension of the (square) matrix.
    lr_comms : object providing ``parent_comm``, ``dd_comm`` and
        ``eh_comm``.
    """
    mprocs, nprocs, block_size = tuple(sl_lrtddft)
    self.mprocs = mprocs
    self.nprocs = nprocs
    self.block_size = block_size
    self.lr_comms = lr_comms

    parent = lr_comms.parent_comm

    # Grid matching how the matrix is stored.
    self.matrix_grid = BlacsGrid(parent,
                                 lr_comms.dd_comm.size,
                                 lr_comms.eh_comm.size)
    # Grid on which the diagonalization is performed.
    self.diag_grid = BlacsGrid(parent, mprocs, nprocs)

    # ScaLAPACK needs the TRANSPOSED matrix (and vector).  The matrix is
    # square, so the row and column counts are both nrows; the storage
    # layout uses 1 x 1 blocks.
    self.matrix_descr = self.matrix_grid.new_descriptor(nrows, nrows, 1, 1)
    self.diag_descr = self.diag_grid.new_descriptor(nrows, nrows,
                                                    block_size, block_size)

    # Storage layout -> diagonalization layout, and back again.
    self.diag_in_redist = Redistributor(parent,
                                        self.matrix_descr,
                                        self.diag_descr)
    self.diag_out_redist = Redistributor(parent,
                                         self.diag_descr,
                                         self.matrix_descr)
def diagonalize(self):
    """Diagonalize the Hamiltonian and store eigenvalues/eigenvectors.

    The t and T represent local and global eigenstates indices
    respectively.

    Without ``self.td`` the matrix is treated as non-Hermitian: it is
    collected with ``collect_A_SS``, the rows/columns listed in
    ``self.excludef_S`` are removed, and ``numpy.linalg.eig`` is run on
    rank 0 only (results in ``self.w_T`` and, on rank 0, ``self.v_ST``).
    With ``self.td`` the matrix is diagonalized with LAPACK (one rank)
    or ScaLAPACK (several ranks); results go to ``self.w_T`` and
    ``self.v_St``.
    """
    print('Diagonalizing Hamiltonian', file=self.fd)

    if not self.td:
        # Non-Hermitian matrix can only use linalg.eig
        print(' Using numpy.linalg.eig...', file=self.fd)
        print(' Eliminated %s pair orbitals' % len(self.excludef_S),
              file=self.fd)

        self.H_SS = self.collect_A_SS(self.H_sS)
        # Allocated on every rank so that it can receive the broadcast.
        self.w_T = np.zeros(self.nS - len(self.excludef_S), complex)
        if world.rank == 0:
            self.H_SS = np.delete(self.H_SS, self.excludef_S, axis=0)
            self.H_SS = np.delete(self.H_SS, self.excludef_S, axis=1)
            self.w_T, self.v_ST = np.linalg.eig(self.H_SS)
        world.broadcast(self.w_T, 0)
        self.df_S = np.delete(self.df_S, self.excludef_S)
        self.rhoG0_S = np.delete(self.rhoG0_S, self.excludef_S)
    elif world.size == 1:
        # Here the eigenvectors are returned as complex conjugated rows
        print(' Using lapack...', file=self.fd)
        from gpaw.utilities.lapack import diagonalize
        self.w_T = np.zeros(self.nS)
        diagonalize(self.H_sS, self.w_T)
        self.v_St = self.H_sS.conj().T
    else:
        # Here the eigenvectors are returned as complex conjugated rows
        print(' Using scalapack...', file=self.fd)
        nS = self.nS
        # Row-block size of the input layout (ceiling division of the
        # k-point count over the ranks, times the per-k-point size).
        ns = -(-self.kd.nbzkpts // world.size) * (
            self.nv * self.nc * self.spins * (self.spinors + 1)**2)
        grid = BlacsGrid(world, world.size, 1)
        desc = grid.new_descriptor(nS, nS, ns, nS)
        desc2 = grid.new_descriptor(nS, nS, 2, 2)

        H_tmp = desc2.zeros(dtype=complex)
        r = Redistributor(world, desc, desc2)
        r.redistribute(self.H_sS, H_tmp)

        self.w_T = np.empty(nS)
        v_tmp = desc2.empty(dtype=complex)
        desc2.diagonalize_dc(H_tmp, v_tmp, self.w_T)

        r = Redistributor(grid.comm, desc2, desc)
        self.v_St = desc.zeros(dtype=complex)
        r.redistribute(v_tmp, self.v_St)
        self.v_St = self.v_St.conj().T

    if self.write_v and self.td:
        # Cannot use par_save without td
        self.par_save('v_TS.ulm', 'v_TS', self.v_St.T)
def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
             blocksize, nao, timer=nulltimer):
    """Set up grids, descriptors and redistributors for nao x nao matrices.

    ``gd``, ``bd``, ``block_comm``, ``dtype``, ``mcpus``, ``ncpus``,
    ``blocksize`` and ``timer`` are forwarded to ``BlacsLayouts.__init__``;
    ``nao`` is the number of basis functions (the matrix dimension).
    """
    BlacsLayouts.__init__(self, gd, bd, block_comm, dtype,
                          mcpus, ncpus, blocksize, timer)
    nbands = bd.nbands
    self.blocksize = blocksize
    self.mynbands = mynbands = bd.mynbands

    self.orbital_comm = self.bd.comm
    # Ceiling division: number of basis-function rows per band rank.
    self.naoblocksize = naoblocksize = -((-nao) // self.orbital_comm.size)
    self.nao = nao

    # Range of basis functions for BLACS distribution of matrices.
    # Mstart is clamped to nao: with more band ranks than row blocks the
    # trailing ranks would otherwise get Mstart > Mstop and a negative
    # mynao.  After clamping they own an empty range (mynao == 0).
    self.Mmax = nao
    self.Mstart = min(bd.comm.rank * naoblocksize, self.Mmax)
    self.Mstop = min(self.Mstart + naoblocksize, self.Mmax)
    self.mynao = self.Mstop - self.Mstart

    # Column layout for one matrix per band rank:
    self.columngrid = BlacsGrid(bd.comm, bd.comm.size, 1)
    self.mMdescriptor = self.columngrid.new_descriptor(
        nao, nao, naoblocksize, nao)
    self.nMdescriptor = self.columngrid.new_descriptor(
        nbands, nao, mynbands, nao)

    # Column layout for one matrix in total (only on grid masters):
    self.single_column_grid = BlacsGrid(self.column_comm, bd.comm.size, 1)
    self.mM_unique_descriptor = self.single_column_grid.new_descriptor(
        nao, nao, naoblocksize, nao)

    # nM_unique_descriptor is meant to hold the coefficients after
    # diagonalization; the array is afterwards broadcast across the grid
    # descriptor's communicator.  (An older note here said BLACS requires
    # an nao-by-nao array; the descriptor created below is nbands-by-nao.)
    self.nM_unique_descriptor = self.single_column_grid.new_descriptor(
        nbands, nao, mynbands, nao)

    # Fully blocked grid for diagonalization with many CPUs:
    self.mmdescriptor = self.blockgrid.new_descriptor(
        nao, nao, blocksize, blocksize)

    self.mM2mm = Redistributor(self.block_comm,
                               self.mM_unique_descriptor,
                               self.mmdescriptor)
    self.mm2nM = Redistributor(self.block_comm,
                               self.mmdescriptor,
                               self.nM_unique_descriptor)
def redistribute_H(self, H_sS):
    """Copy ``H_sS`` from the ``size x 1`` grid to the ``1 x size`` grid.

    The input is described with blocks of ``self.nS_local`` rows by
    ``self.nS`` columns; the returned array is described with blocks of
    ``self.nS`` rows by ``self.nS_local`` columns and has the dtype of
    ``H_sS``.
    """
    rows_grid = BlacsGrid(world, size, 1)
    cols_grid = BlacsGrid(world, 1, size)

    dim = self.nS
    src_desc = rows_grid.new_descriptor(dim, dim, self.nS_local, dim)
    dst_desc = cols_grid.new_descriptor(dim, dim, dim, self.nS_local)

    H_Ss = dst_desc.empty(dtype=H_sS.dtype)
    Redistributor(world, src_desc, dst_desc).redistribute(H_sS, H_Ss)
    return H_Ss
def main(nbands=1000, mprocs=2, mb=64):
    """Compare ``diagonalize_dc`` eigenvalues with a serial reference.

    A test matrix with ``beta`` on the diagonal and ``alpha`` below it is
    set up on an ``mprocs x mprocs`` grid with ``mb x mb`` blocks and
    diagonalized; the same matrix is then built as a replicated NumPy
    array and passed to ``diagonalize``.  Both timings are printed on
    rank 0 and the largest eigenvalue deviation must stay below ``tol``.
    """
    # Set up the BlacsGrid and the matrix descriptor.
    grid = BlacsGrid(world, mprocs, mprocs)
    desc = grid.new_descriptor(nbands, nbands, mb, mb)

    # Outside the BlacsGrid these two arrays are size zero.
    H_nn = desc.empty(dtype=float)
    C_nn = desc.empty(dtype=float)
    # Replicated array on all MPI tasks.
    eps_N = np.empty((nbands), dtype=float)

    # Fill the ScaLAPACK array.
    alpha = 0.1   # off-diagonal
    beta = 75.0   # diagonal
    uplo = 'L'    # lower-triangular
    scalapack_set(desc, H_nn, alpha, beta, uplo)
    scalapack_zero(desc, H_nn, switch_uplo[uplo])

    # The descriptor method is the recommended interface; the free
    # function scalapack_diagonalize_dc would work as well.
    start = time()
    desc.diagonalize_dc(H_nn.copy(), C_nn, eps_N)
    stop = time()
    world.broadcast(eps_N, 0)  # all MPI tasks now have eps_N
    world.barrier()  # wait for everyone to finish

    if rank == 0:
        print('ScaLAPACK diagonalize_dc', stop - start)

    # Build the same matrix as a replicated NumPy array.
    diagonal = np.eye(nbands, dtype=float)
    offdiagonal = np.tril(np.ones((nbands, nbands)), -1)
    H0 = beta * diagonal + alpha * offdiagonal
    E0 = np.empty((nbands), dtype=float)

    start = time()
    diagonalize(H0, E0)
    stop = time()

    if rank == 0:
        print('LAPACK diagonalize', stop - start)

    delta = abs(E0 - eps_N).max()
    if rank == 0:
        print(delta)
    assert delta < tol
def parallel_eigh(matrixfile, blacsgrid=(4, 2), blocksize=64):
    """Diagonalize matrix in parallel.

    The matrix is loaded with ``np.load(matrixfile)`` on rank ``MASTER``
    only, spread over the ``blacsgrid`` process grid in
    ``blocksize x blocksize`` blocks, and diagonalized with
    ``diagonalize_ex``.  The grid must use every rank of ``world``.

    Returns ``(eps_M, C_MM)`` on ``MASTER`` and ``(None, None)`` elsewhere.
    """
    assert np.prod(blacsgrid) == world.size
    grid = BlacsGrid(world, *blacsgrid)

    on_master = world.rank == MASTER
    NM = 0
    if on_master:
        H_MM = np.load(matrixfile)
        assert H_MM.ndim == 2
        assert H_MM.shape[0] == H_MM.shape[1]
        NM = len(H_MM)
    # Only MASTER contributes a non-zero value, so the sum distributes
    # the matrix dimension to all nodes.
    NM = world.sum(NM)

    # Descriptor for the individual blocks.
    block_desc = grid.new_descriptor(NM, NM, blocksize, blocksize)
    # Descriptor for the global array (a single NM x NM block).
    local_desc = grid.new_descriptor(NM, NM, NM, NM)

    # Dummy array on all the other ranks.
    if not on_master:
        H_MM = local_desc.zeros()
    assert local_desc.check(H_MM)

    # Distribute the global array into smaller blocks.
    H_mm = block_desc.empty()
    Redistributor(world, local_desc, block_desc).redistribute(H_MM, H_mm)

    # Eigenvalues and eigenvectors.
    eps_M = np.empty(NM)
    C_mm = block_desc.empty()
    block_desc.diagonalize_ex(H_mm, C_mm, eps_M)

    # Collect the eigenvectors into the single-block layout.
    C_MM = local_desc.empty()
    Redistributor(world, block_desc, local_desc).redistribute(C_mm, C_MM)

    if not on_master:
        return None, None
    return eps_M, C_MM
def distribute_MM(wfs, a_MM):
    """Redistribute ``a_MM`` according to ``wfs.ksl.mmdescriptor``.

    When ``wfs.ksl.using_blacs`` is false the input is returned
    unchanged.  Otherwise the matrix is described as a single
    ``nao x nao`` block on a 1 x 1 grid over ``ksl.block_comm`` and
    redistributed; ranks other than 0 of that communicator replace their
    input by an empty array from that descriptor before the call.
    """
    ksl = wfs.ksl
    if not ksl.using_blacs:
        return a_MM

    comm = ksl.block_comm
    dim = ksl.nao
    dtype = a_MM.dtype

    full_desc = BlacsGrid(comm, 1, 1).new_descriptor(dim, dim, dim, dim)
    to_blocks = Redistributor(comm, full_desc, ksl.mmdescriptor)

    if comm.rank != 0:
        a_MM = full_desc.empty(dtype=dtype)
    a_mm = ksl.mmdescriptor.empty(dtype=dtype)
    to_blocks.redistribute(a_MM, a_mm)
    return a_mm
def redistribute(self, in_wGG, out_x=None):
    """Redistribute array.

    Switch between two kinds of parallel distributions:

    1) parallel over G-vectors (second dimension of in_wGG)
    2) parallel over frequency (first dimension of in_wGG)

    The direction is chosen from the input: if ``len(in_wGG)`` equals
    the number of frequencies the array is taken to be in layout 1 and
    is converted to layout 2's counterpart, otherwise the reverse.

    Returns new array using the memory in the 1-d array out_x; if
    ``out_x`` is None a new complex array is allocated.  With a single
    rank in ``self.blockcomm`` the input is returned unchanged.
    """
    comm = self.blockcomm
    if comm.size == 1:
        return in_wGG

    nw = len(self.omega_w)
    nG = in_wGG.shape[2]
    # Ceiling divisions: per-rank share of frequencies and G-vectors.
    mynw = (nw + comm.size - 1) // comm.size
    mynG = (nG + comm.size - 1) // comm.size

    bg1 = BlacsGrid(comm, comm.size, 1)
    bg2 = BlacsGrid(comm, 1, comm.size)
    md1 = BlacsDescriptor(bg1, nw, nG**2, mynw, nG**2)
    md2 = BlacsDescriptor(bg2, nw, nG**2, nw, mynG * nG)

    if len(in_wGG) == nw:
        mdin, mdout = md2, md1
    else:
        mdin, mdout = md1, md2

    r = Redistributor(comm, mdin, mdout)

    outshape = (mdout.shape[0], mdout.shape[1] // nG, nG)
    if out_x is None:
        out_wGG = np.empty(outshape, complex)
    else:
        # np.prod replaces np.product, which was removed in NumPy 2.0.
        out_wGG = out_x[:np.prod(outshape)].reshape(outshape)

    r.redistribute(in_wGG.reshape(mdin.shape),
                   out_wGG.reshape(mdout.shape))
    return out_wGG
def scal_diagonalize(A, nodes='master'):
    """Diagonalize the Hermitian matrix A (size N*N) with ScaLAPACK.

    Usage: ``eps, B = scal_diagonalize(A)``, where ``eps`` and ``B`` are
    the eigenvalues and eigenvectors.

    Parameters
    ----------
    A : square array; asserted to be exactly Hermitian.
    nodes : ``'master'`` - eigenvectors only available on the master node;
        ``'all'`` - eigenvectors broadcast from rank 0 to all nodes.
        Any other value makes the function return ``None``.

    Returns
    -------
    (eps, B) : ``eps`` is a real (float) array of length N, ``B`` has the
        dtype of ``A``.
    """
    # Make sure A is N*N and Hermitian (exact comparison, done in one
    # vectorized call instead of an element-by-element Python loop).
    N = A.shape[0]
    assert A.shape[0] == A.shape[1]
    assert np.array_equal(A, A.conj().T)

    # Create the BLACS descriptors: one single N x N block, and one with
    # mb x mb blocks for the actual diagonalization.
    mb = 64
    g = BlacsGrid(world, 2, size // 2)
    nndesc1 = g.new_descriptor(N, N, N, N)
    nndesc2 = g.new_descriptor(N, N, mb, mb)

    # Distribute A to the BLACS grid (A_); only rank 0 supplies data.
    if rank != 0:
        A = nndesc1.zeros(dtype=A.dtype)
    A_ = nndesc2.empty(dtype=A.dtype)
    redistributor = Redistributor(world, nndesc1, nndesc2)
    redistributor.redistribute(A, A_)

    # Diagonalize.  Eigenvalues of a Hermitian matrix are real, so the
    # buffer is float even when A is complex (a complex buffer would be
    # only partially and incorrectly filled by the real-valued output).
    B_ = nndesc2.zeros(dtype=A.dtype)
    eps = np.zeros(N, dtype=float)
    nndesc2.diagonalize_dc(A_, B_, eps, 'L')

    # Collect the eigenvectors in the single-block layout.
    B = np.zeros_like(A)
    redistributor = Redistributor(world, nndesc2, nndesc1)
    redistributor.redistribute(B_, B)

    if nodes == 'master':
        return eps, B
    elif nodes == 'all':
        if rank != 0:
            # The receive buffer must match the dtype sent by rank 0.
            B = np.zeros((N, N), dtype=A.dtype)
        world.broadcast(B, 0)
        return eps, B
def main(nbands=1000, mprocs=2, mb=64):
    """Check ``diagonalize_dc`` eigenvalues against a serial reference.

    Builds a matrix with ``beta`` on the diagonal and ``alpha`` below it
    on an ``mprocs x mprocs`` grid (``mb x mb`` blocks), diagonalizes it,
    then repeats the calculation on a replicated NumPy array with
    ``diagonalize``.  Timings are printed on rank 0; the maximum
    eigenvalue difference is asserted to be below ``tol``.
    """
    # BlacsGrid and descriptor.
    proc_grid = BlacsGrid(world, mprocs, mprocs)
    nndesc = proc_grid.new_descriptor(nbands, nbands, mb, mb)

    # H_nn and C_nn are size zero outside the BlacsGrid; eps_N is
    # replicated on all MPI tasks.
    H_nn = nndesc.empty(dtype=float)
    C_nn = nndesc.empty(dtype=float)
    eps_N = np.empty((nbands), dtype=float)

    # Fill the ScaLAPACK array.
    alpha = 0.1   # off-diagonal
    beta = 75.0   # diagonal
    uplo = 'L'    # lower-triangular
    scalapack_set(nndesc, H_nn, alpha, beta, uplo)
    scalapack_zero(nndesc, H_nn, switch_uplo[uplo])

    # scalapack_diagonalize_dc(nndesc, ...) would work too; the
    # descriptor method is the recommended interface.
    t_begin = time()
    nndesc.diagonalize_dc(H_nn.copy(), C_nn, eps_N)
    t_end = time()
    world.broadcast(eps_N, 0)  # all MPI tasks now have eps_N
    world.barrier()  # wait for everyone to finish

    if rank == 0:
        print('ScaLAPACK diagonalize_dc', t_end - t_begin)

    # Replicated NumPy version of the same matrix.
    diagonal = np.eye(nbands, dtype=float)
    offdiagonal = np.tril(np.ones((nbands, nbands)), -1)
    H0 = beta * diagonal + alpha * offdiagonal
    E0 = np.empty((nbands), dtype=float)

    t_begin = time()
    diagonalize(H0, E0)
    t_end = time()

    if rank == 0:
        print('LAPACK diagonalize', t_end - t_begin)

    delta = abs(E0 - eps_N).max()
    if rank == 0:
        print(delta)
    assert delta < tol
def scal_diagonalize(A, nodes='master'):
    """Diagonalize the Hermitian matrix A (size N*N) with ScaLAPACK.

    Usage: ``eps, B = scal_diagonalize(A)``, where ``eps`` and ``B`` are
    the eigenvalues and eigenvectors.

    Parameters
    ----------
    A : square array; asserted to be exactly Hermitian.
    nodes : ``'master'`` - eigenvectors only available on the master node;
        ``'all'`` - eigenvectors broadcast from rank 0 to all nodes.
        Any other value makes the function return ``None``.

    Returns
    -------
    (eps, B) : ``eps`` is a real (float) array of length N, ``B`` has the
        dtype of ``A``.
    """
    # Make sure A is N*N and Hermitian (exact comparison, done in one
    # vectorized call instead of an element-by-element Python loop).
    N = A.shape[0]
    assert A.shape[0] == A.shape[1]
    assert np.array_equal(A, A.conj().T)

    # Create the BLACS descriptors: one single N x N block, and one with
    # mb x mb blocks for the actual diagonalization.
    mb = 64
    g = BlacsGrid(world, 2, size // 2)
    nndesc1 = g.new_descriptor(N, N, N, N)
    nndesc2 = g.new_descriptor(N, N, mb, mb)

    # Distribute A to the BLACS grid (A_); only rank 0 supplies data.
    if rank != 0:
        A = nndesc1.zeros(dtype=A.dtype)
    A_ = nndesc2.empty(dtype=A.dtype)
    redistributor = Redistributor(world, nndesc1, nndesc2)
    redistributor.redistribute(A, A_)

    # Diagonalize.  Eigenvalues of a Hermitian matrix are real, so the
    # buffer is float even when A is complex (a complex buffer would be
    # only partially and incorrectly filled by the real-valued output).
    B_ = nndesc2.zeros(dtype=A.dtype)
    eps = np.zeros(N, dtype=float)
    nndesc2.diagonalize_dc(A_, B_, eps, 'L')

    # Collect the eigenvectors in the single-block layout.
    B = np.zeros_like(A)
    redistributor = Redistributor(world, nndesc2, nndesc1)
    redistributor.redistribute(B_, B)

    if nodes == 'master':
        return eps, B
    elif nodes == 'all':
        if rank != 0:
            # The receive buffer must match the dtype sent by rank 0.
            B = np.zeros((N, N), dtype=A.dtype)
        world.broadcast(B, 0)
        return eps, B
def test(comm, M, N, mcpus, ncpus, mb, nb):
    """Print which rank stores which part of a distributed M x N matrix.

    A matrix on an ``mcpus x ncpus`` grid with ``mb x nb`` blocks is
    filled, on every rank, with that rank's number.  Each rank in turn
    prints the global index ranges of its blocks; the matrix is then
    redistributed into a single M x N block on a 1 x 1 grid and printed
    on rank 0.

    The print statements use the Python 3 function form (the Python 2
    statement form is a syntax error there).
    """
    grid0 = BlacsGrid(comm, 1, 1)
    desc0 = grid0.new_descriptor(M, N, M, N, 0, 0)
    A_mn = desc0.zeros(dtype=float)
    # A value no rank number can take.
    A_mn[:] = comm.size + 1

    grid1 = BlacsGrid(comm, mcpus, ncpus)
    desc1 = grid1.new_descriptor(M, N, mb, nb, 0, 0)  # ???
    B_mn = desc1.zeros(dtype=float)
    B_mn[:] = comm.rank

    if comm.rank == 0:
        msg = 'Slices of global matrix indices by rank'
        print(msg)
        print('-' * len(msg))

    # One rank prints per iteration; the barrier keeps output ordered.
    for rank in range(comm.size):
        comm.barrier()
        if rank == comm.rank:
            print('Rank %d:' % rank)
            last_Mstart = -1
            for Mstart, Mstop, Nstart, Nstop, block in desc1.my_blocks(B_mn):
                # New output line whenever the row range advances.
                if Mstart > last_Mstart and last_Mstart >= 0:
                    print()
                print('[%3d:%3d, %3d:%3d]' % (Mstart, Mstop, Nstart, Nstop),
                      end=' ')
                last_Mstart = Mstart
                assert (block == comm.rank).all()
            print()
            print()
    comm.barrier()

    redistributor = Redistributor(comm, desc1, desc0)
    redistributor.redistribute(B_mn, A_mn)

    if comm.rank == 0:
        msg = 'Rank where each element of the global matrix is stored'
        print(msg)
        print('-' * len(msg))
        print(A_mn)
def scalapack_diagonalize(self, H_sS):
    """Diagonalize the distributed matrix ``H_sS`` with ScaLAPACK.

    The input is described on a ``size x 1`` grid (blocks of
    ``self.nS_local`` by ``self.nS``), moved to a ``(size // 2) x 2``
    grid with 32 x 32 blocks for ``diagonalize_dc`` (called with
    ``'L'``), and the eigenvectors are moved back to the input layout.

    Returns ``(w_S, v_sS.conj())``: a float array of ``self.nS``
    eigenvalues and the conjugated eigenvector array, shaped like
    ``H_sS``.
    """
    dim = self.nS
    blk = 32

    # Grids are created in the same order as before.
    grid_in = BlacsGrid(world, size, 1)
    grid_diag = BlacsGrid(world, size // 2, 2)
    desc_in = grid_in.new_descriptor(dim, dim, self.nS_local, dim)
    desc_diag = grid_diag.new_descriptor(dim, dim, blk, blk)

    # Forward redistribution.
    A_ss = desc_diag.empty(dtype=H_sS.dtype)
    forward = Redistributor(world, desc_in, desc_diag)
    forward.redistribute(H_sS, A_ss)

    # Diagonalization.
    v_ss = desc_diag.zeros(dtype=A_ss.dtype)
    w_S = np.zeros(dim, dtype=float)
    desc_diag.diagonalize_dc(A_ss, v_ss, w_S, 'L')

    # Backward redistribution of the eigenvectors; no gather into a
    # full matrix is done here.
    v_sS = np.zeros_like(H_sS)
    backward = Redistributor(world, desc_diag, desc_in)
    backward.redistribute(v_ss, v_sS)

    return w_S, v_sS.conj()
def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
             blocksize, buffer_size=None, timer=nulltimer):
    """Set up grids, descriptors and redistributors for nbands x nbands.

    All arguments except ``buffer_size`` are forwarded to
    ``BlacsLayouts.__init__``; ``buffer_size`` is only stored.
    """
    BlacsLayouts.__init__(self, gd, bd, block_comm, dtype,
                          mcpus, ncpus, blocksize, timer)
    self.buffer_size = buffer_size
    self.blocksize = blocksize
    self.mynbands = bd.mynbands

    nbands = bd.nbands
    mynbands = bd.mynbands
    nranks = bd.comm.size

    # 1D layout - columns (blocks of nbands x mynbands).
    self.columngrid = BlacsGrid(self.column_comm, 1, nranks)
    self.Nndescriptor = self.columngrid.new_descriptor(
        nbands, nbands, nbands, mynbands)

    # 2D layout (square blocks on the block grid).
    self.nndescriptor = self.blockgrid.new_descriptor(
        nbands, nbands, blocksize, blocksize)

    # 1D layout - rows (blocks of mynbands x nbands).
    self.rowgrid = BlacsGrid(self.column_comm, nranks, 1)
    self.nNdescriptor = self.rowgrid.new_descriptor(
        nbands, nbands, mynbands, nbands)

    # Columns -> blocks.  The redistributor is created without a
    # triangle argument; a variant passing 'L' (only the filled-out half
    # of a Hermitian matrix) was noted as faster but is not used.
    self.Nn2nn = Redistributor(self.block_comm,
                               self.Nndescriptor,
                               self.nndescriptor)

    # Blocks -> rows.  The resulting matrix will be used in dgemm, which
    # is symmetry oblivious.
    self.nn2nN = Redistributor(self.block_comm,
                               self.nndescriptor,
                               self.nNdescriptor)
def collect_wuMM(wfs, a_wuMM, w, s, k):
    """Collect the matrix ``a_wuMM[w][u]`` for spin ``s``, k-point ``k``.

    This function is based on
    gpaw/wavefunctions/base.py: WaveFunctions.collect_auxiliary()

    The k-point rank owning ``(s, k)`` gathers the matrix within its
    BLACS grid (if ``ksl.using_blacs``) and, unless it is k-point rank 0
    itself, its ``ksl.block_comm`` rank 0 sends the matrix to k-point
    rank 0 (message tag 2017).

    Returns the ``nao x nao`` matrix on the receiving master and
    ``None`` on every other rank.
    """
    tag = 2017
    ksl = wfs.ksl
    kd_comm = wfs.kd.comm
    block_comm = ksl.block_comm
    dim = ksl.nao
    dtype = a_wuMM[0][0].dtype

    owner, u = wfs.kd.get_rank_and_index(s, k)

    if kd_comm.rank == owner:
        a_MM = a_wuMM[w][u]

        # Collect within the BLACS grid into one dim x dim block.
        if ksl.using_blacs:
            a_mm = a_MM
            full_desc = BlacsGrid(block_comm, 1, 1).new_descriptor(
                dim, dim, dim, dim)
            gather = Redistributor(block_comm, ksl.mmdescriptor, full_desc)
            a_MM = full_desc.empty(dtype=dtype)
            gather.redistribute(a_mm, a_MM)

        # Only the KSL master passes the matrix on; other ranks of the
        # owning group fall through and return None.
        if block_comm.rank == 0:
            if owner != 0:
                kd_comm.send(a_MM, 0, tag)
                return None
            # Already on the global master.
            assert wfs.world.rank == 0
            return a_MM
    elif block_comm.rank == 0 and owner != 0:
        assert wfs.world.rank == 0
        a_MM = np.empty((dim, dim), dtype=dtype)
        kd_comm.receive(a_MM, owner, tag)
        return a_MM
def distribute_frequencies(self, chi0_wGG):
    """Distribute frequencies to all cores.

    With one rank in ``self.world`` the input is returned as is.  With
    one rank in ``self.blockcomm`` each rank returns a copy of its own
    frequency slice.  Otherwise the array is redistributed from the
    column layout on ``self.blockcomm`` (supplied only by ranks with
    ``self.kncomm.rank == 0``) to a frequency-block layout over the
    whole world, and a complex array of shape ``(wb - wa, nG, nG)`` is
    returned.
    """
    world = self.world
    if world.size == 1:
        return chi0_wGG

    comm = self.blockcomm
    nw = len(self.omega_w)
    nG = chi0_wGG.shape[2]

    # Ceiling divisions: frequencies per world rank, G-vectors per
    # block rank.
    mynw = (nw + world.size - 1) // world.size
    mynG = (nG + comm.size - 1) // comm.size

    # This rank's frequency range, clipped to nw.
    wa = min(world.rank * mynw, nw)
    wb = min(wa + mynw, nw)

    if comm.size == 1:
        return chi0_wGG[wa:wb].copy()

    if self.kncomm.rank == 0:
        bg1 = BlacsGrid(comm, 1, comm.size)
        in_wGG = chi0_wGG.reshape((nw, -1))
    else:
        # These ranks contribute no data to the source layout.
        bg1 = DryRunBlacsGrid(mpi.serial_comm, 1, 1)
        in_wGG = np.zeros((0, 0), complex)
    md1 = BlacsDescriptor(bg1, nw, nG**2, nw, mynG * nG)

    bg2 = BlacsGrid(world, world.size, 1)
    md2 = BlacsDescriptor(bg2, nw, nG**2, mynw, nG**2)

    out_wGG = np.empty((wb - wa, nG, nG), complex)
    Redistributor(world, md1, md2).redistribute(
        in_wGG, out_wGG.reshape((wb - wa, nG**2)))
    return out_wGG
def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
             blocksize, timer=nulltimer):
    """Initialize the base layouts and create the 2D block grid.

    ``gd``, ``bd``, ``block_comm``, ``dtype`` and ``timer`` go to
    ``KohnShamLayouts.__init__``; ``mcpus`` and ``ncpus`` give the shape
    of the BLACS grid created on ``self.block_comm``.
    """
    KohnShamLayouts.__init__(self, gd, bd, block_comm, dtype, timer)

    self.blocksize = blocksize

    # WARNING: Do not create the BlacsGrid on a communicator which does
    # not contain block_comm.rank = 0.  This will break BlacsBandLayouts,
    # which assume eps_M will be broadcast over block_comm.
    block_grid = BlacsGrid(self.block_comm, mcpus, ncpus)
    self.blockgrid = block_grid
def test(comm, M, N, mcpus, ncpus, mb, nb):
    """Show how an M x N matrix is spread over an mcpus x ncpus grid.

    Every rank fills its local part of the block-distributed matrix
    (``mb x nb`` blocks) with its own rank number and, in rank order,
    prints the global index ranges of its blocks.  The matrix is then
    redistributed to a single M x N block on a 1 x 1 grid and printed on
    rank 0.
    """
    def heading(text):
        # Title followed by a dashed underline of the same length.
        print(text)
        print('-' * len(text))

    single_grid = BlacsGrid(comm, 1, 1)
    single_desc = single_grid.new_descriptor(M, N, M, N, 0, 0)
    A_mn = single_desc.zeros(dtype=float)
    # A value no rank number can take.
    A_mn[:] = comm.size + 1

    block_grid = BlacsGrid(comm, mcpus, ncpus)
    block_desc = block_grid.new_descriptor(M, N, mb, nb, 0, 0)  # ???
    B_mn = block_desc.zeros(dtype=float)
    B_mn[:] = comm.rank

    if comm.rank == 0:
        heading('Slices of global matrix indices by rank')

    # Ranks take turns printing; the barrier orders the output.
    for turn in range(comm.size):
        comm.barrier()
        if turn != comm.rank:
            continue
        print('Rank %d:' % turn)
        previous_row = -1
        for Mstart, Mstop, Nstart, Nstop, block in block_desc.my_blocks(B_mn):
            # Line break whenever the row range advances.
            if previous_row >= 0 and Mstart > previous_row:
                print()
            print('[%3d:%3d, %3d:%3d]' % (Mstart, Mstop, Nstart, Nstop),
                  end=' ')
            previous_row = Mstart
            assert (block == comm.rank).all()
        print()
        print()
    comm.barrier()

    Redistributor(comm, block_desc, single_desc).redistribute(B_mn, A_mn)

    if comm.rank == 0:
        heading('Rank where each element of the global matrix is stored')
        print(A_mn)
class BlacsOrbitalLayouts(BlacsLayouts):
    """ScaLAPACK Dense Linear Algebra.

    This class is instantiated in LCAO.  Not for casual use, at least for
    now.

    Requires two distributors and three descriptors for initialization
    as well as grid descriptors and band descriptors. Distributors are
    for cols2blocks (1D -> 2D BLACS grid) and blocks2cols (2D -> 1D
    BLACS grid). ScaLAPACK operations must occur on 2D BLACS grid for
    performance and scalability.

    _general_diagonalize is "hard-coded" for LCAO.
    Expects both Hamiltonian and Overlap matrix to be on the 2D BLACS grid.
    This is done early on to save memory.
    """
    # XXX rewrite this docstring a bit!

    # This class 'describes' all the LCAO Blacs-related layouts
    def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
                 blocksize, nao, timer=nulltimer):
        """Set up grids, descriptors and redistributors.

        All arguments except ``nao`` are forwarded to
        ``BlacsLayouts.__init__``; ``nao`` is the number of basis
        functions (the matrix dimension).
        """
        BlacsLayouts.__init__(self, gd, bd, block_comm, dtype,
                              mcpus, ncpus, blocksize, timer)
        nbands = bd.nbands
        self.blocksize = blocksize
        self.mynbands = mynbands = bd.mynbands

        self.orbital_comm = self.bd.comm
        # Ceiling division: basis-function rows per band rank.
        self.naoblocksize = naoblocksize = -((-nao) //
                                             self.orbital_comm.size)
        self.nao = nao

        # Range of basis functions for BLACS distribution of matrices.
        # Mstart is clamped to nao: with more band ranks than row blocks
        # the trailing ranks would otherwise get Mstart > Mstop and a
        # negative mynao.  After clamping they own an empty range.
        self.Mmax = nao
        self.Mstart = min(bd.comm.rank * naoblocksize, self.Mmax)
        self.Mstop = min(self.Mstart + naoblocksize, self.Mmax)
        self.mynao = self.Mstop - self.Mstart

        # Column layout for one matrix per band rank:
        self.columngrid = BlacsGrid(bd.comm, bd.comm.size, 1)
        self.mMdescriptor = self.columngrid.new_descriptor(
            nao, nao, naoblocksize, nao)
        self.nMdescriptor = self.columngrid.new_descriptor(
            nbands, nao, mynbands, nao)

        # Column layout for one matrix in total (only on grid masters):
        self.single_column_grid = BlacsGrid(self.column_comm,
                                            bd.comm.size, 1)
        self.mM_unique_descriptor = self.single_column_grid.new_descriptor(
            nao, nao, naoblocksize, nao)

        # nM_unique_descriptor is meant to hold the coefficients after
        # diagonalization; the array is afterwards broadcast across the
        # grid descriptor's communicator.  (An older note here said BLACS
        # requires an nao-by-nao array; the descriptor created below is
        # nbands-by-nao.)
        self.nM_unique_descriptor = self.single_column_grid.new_descriptor(
            nbands, nao, mynbands, nao)

        # Fully blocked grid for diagonalization with many CPUs:
        self.mmdescriptor = self.blockgrid.new_descriptor(
            nao, nao, blocksize, blocksize)

        self.mM2mm = Redistributor(self.block_comm,
                                   self.mM_unique_descriptor,
                                   self.mmdescriptor)
        self.mm2nM = Redistributor(self.block_comm,
                                   self.mmdescriptor,
                                   self.nM_unique_descriptor)

    def diagonalize(self, H_mm, C_nM, eps_n, S_mm):
        """Solve the generalized eigenproblem; fill ``C_nM`` and ``eps_n``.

        ``H_mm`` and ``S_mm`` are given in the blocked layout.  ``C_nM``
        and ``eps_n`` are overwritten in place; nothing is returned.
        """
        # C_nM needs to be simultaneously compatible with:
        # 1. outdescriptor
        # 2. broadcast with gd.comm
        # We do this with a dummy buffer C2_nM
        outdescriptor = self.mm2nM.dstdescriptor  # blocks2cols
        blockdescriptor = self.mM2mm.dstdescriptor  # cols2blocks

        dtype = S_mm.dtype
        eps_M = np.empty(C_nM.shape[-1])  # empty helps us debug
        subM, subN = outdescriptor.gshape

        C_mm = blockdescriptor.zeros(dtype=dtype)
        self.timer.start('General diagonalize')
        # general_diagonalize_ex may have a buffer overflow, so
        # we no longer use it
        blockdescriptor.general_diagonalize_dc(H_mm, S_mm.copy(), C_mm,
                                               eps_M, UL='L')
        self.timer.stop('General diagonalize')

        # Make C_nM compatible with the redistributor
        self.timer.start('Redistribute coefs')
        if outdescriptor:
            C2_nM = C_nM
        else:
            C2_nM = outdescriptor.empty(dtype=dtype)
        assert outdescriptor.check(C2_nM)
        self.mm2nM.redistribute(C_mm, C2_nM, subM, subN)  # blocks2cols
        self.timer.stop('Redistribute coefs')

        self.timer.start('Send coefs to domains')
        # eps_M is already on block_comm.rank = 0
        # easier to broadcast eps_M to all and
        # get the correct slice afterward.
        self.block_comm.broadcast(eps_M, 0)
        eps_n[:] = eps_M[self.bd.get_slice()]
        self.gd.comm.broadcast(C_nM, 0)
        self.timer.stop('Send coefs to domains')

    def distribute_overlap_matrix(self, S_qmM, root=0,
                                  add_hermitian_conjugate=False):
        """Sum ``S_qmM`` over domains and return it in the blocked layout.

        The sum over ``gd.comm`` is done in place on the flattened input.
        With ``add_hermitian_conjugate`` each blocked matrix is combined
        with the output of ``pblas_tran`` applied to a copy of itself.
        """
        # Some MPI implementations need a lot of memory to do large
        # reductions.  To avoid trouble, we do comm.sum on smaller blocks
        # of S (this code is also safe for arrays smaller than blocksize)
        Sflat_x = S_qmM.ravel()
        blocksize = 2**23 // Sflat_x.itemsize  # 8 MiB
        nblocks = -(-len(Sflat_x) // blocksize)
        Mstart = 0
        self.timer.start('blocked summation')
        for i in range(nblocks):
            self.gd.comm.sum(Sflat_x[Mstart:Mstart + blocksize], root=root)
            Mstart += blocksize
        assert Mstart + blocksize >= len(Sflat_x)
        self.timer.stop('blocked summation')

        xshape = S_qmM.shape[:-2]
        nm, nM = S_qmM.shape[-2:]
        S_qmM = S_qmM.reshape(-1, nm, nM)

        blockdesc = self.mmdescriptor
        coldesc = self.mM_unique_descriptor
        S_qmm = blockdesc.zeros(len(S_qmM), S_qmM.dtype)

        if not coldesc:  # XXX ugly way to sort out inactive ranks
            S_qmM = coldesc.zeros(len(S_qmM), S_qmM.dtype)

        self.timer.start('Scalapack redistribute')
        for S_mM, S_mm in zip(S_qmM, S_qmm):
            self.mM2mm.redistribute(S_mM, S_mm)
            if add_hermitian_conjugate:
                if blockdesc.active:
                    pblas_tran(1.0, S_mm.copy(), 1.0, S_mm,
                               blockdesc, blockdesc)

        self.timer.stop('Scalapack redistribute')
        return S_qmm.reshape(xshape + blockdesc.shape)

    def get_overlap_matrix_shape(self):
        """Return the local shape given by the blocked descriptor."""
        return self.mmdescriptor.shape

    def calculate_blocked_density_matrix(self, f_n, C_nM):
        """Return the density matrix in the blocked (``mm``) layout.

        Built from the coefficients ``C_nM`` and the occupation-weighted,
        conjugated coefficients; only ``gd.rank == 0`` supplies data,
        other ranks pass zero arrays to the redistributor.
        """
        nbands = self.bd.nbands
        nao = self.nao
        dtype = C_nM.dtype

        self.nMdescriptor.checkassert(C_nM)
        if self.gd.rank == 0:
            Cf_nM = (C_nM * f_n[:, None]).conj()
        else:
            C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
            Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)

        r = Redistributor(self.block_comm, self.nM_unique_descriptor,
                          self.mmdescriptor)

        Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(Cf_nM, Cf_mm, nbands, nao)
        del Cf_nM

        C_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(C_nM, C_mm, nbands, nao)
        # no use to delete C_nM as it's in the input...

        rho_mm = self.mmdescriptor.zeros(dtype=dtype)

        pblas_simple_gemm(self.mmdescriptor,
                          self.mmdescriptor,
                          self.mmdescriptor,
                          Cf_mm, C_mm, rho_mm, transa='T')
        return rho_mm

    def calculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        """Calculate density matrix from occupations and coefficients.

        Presently this function performs the usual scalapack 3-step trick:
        redistribute-numbercrunching-backdistribute.

        If ``rho_mM`` is given, the result is written into it (on
        ``gd.rank == 0`` before the broadcast over ``gd.comm``) and the
        same array is returned; otherwise a new array is returned.

        Notes on future performance improvement.

        As per the current framework, C_nM exists as copies on each
        domain, i.e. this is not parallel over domains.  We'd like to
        correct this and have an efficient distribution using e.g. the
        block communicator.

        The diagonalization routine and other parts of the code should
        however be changed to accommodate the following scheme:

        Keep coefficients in C_mm form after the diagonalization.
        rho_mm can then be directly calculated from C_mm without
        redistribution, after which we only need to redistribute
        rho_mm across domains.
        """
        dtype = C_nM.dtype
        rho_mm = self.calculate_blocked_density_matrix(f_n, C_nM)
        rback = Redistributor(self.block_comm, self.mmdescriptor,
                              self.mM_unique_descriptor)
        rho1_mM = self.mM_unique_descriptor.zeros(dtype=dtype)
        rback.redistribute(rho_mm, rho1_mM)
        del rho_mm

        if rho_mM is None:
            if self.gd.rank == 0:
                rho_mM = rho1_mM
            else:
                rho_mM = self.mMdescriptor.zeros(dtype=dtype)
        elif self.gd.rank == 0:
            # A caller-supplied buffer must receive the computed result
            # before it is broadcast; otherwise the caller's old contents
            # would be sent to all domains.
            rho_mM[:] = rho1_mM

        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM

    def distribute_to_columns(self, rho_mm, srcdescriptor):
        """Redistribute ``rho_mm`` to the column layout and broadcast it.

        ``srcdescriptor`` describes the layout of ``rho_mm``.  The result
        is broadcast from ``gd.rank == 0`` over ``gd.comm``.
        """
        redistributor = Redistributor(self.block_comm,  # XXX
                                      srcdescriptor,
                                      self.mM_unique_descriptor)
        rho_mM = redistributor.redistribute(rho_mm)
        if self.gd.rank != 0:
            rho_mM = self.mMdescriptor.zeros(dtype=rho_mm.dtype)
        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM

    def oldcalculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        """Older density-matrix routine, parallel over bands only."""
        # This version is parallel over the band descriptor only.
        # This is inefficient, but let's keep it for a while in case
        # there's trouble with the more efficient version
        if rho_mM is None:
            rho_mM = self.mMdescriptor.zeros(dtype=C_nM.dtype)

        Cf_nM = (C_nM * f_n[:, None]).conj()
        pblas_simple_gemm(self.nMdescriptor, self.nMdescriptor,
                          self.mMdescriptor, Cf_nM, C_nM, rho_mM, transa='T')
        return rho_mM

    def get_transposed_density_matrix(self, f_n, C_nM, rho_mM=None):
        """Return the complex conjugate of ``calculate_density_matrix``."""
        return self.calculate_density_matrix(f_n, C_nM, rho_mM).conj()

    def get_description(self):
        """Return the base-class title joined with the filled template."""
        (title, template) = BlacsLayouts.get_description(self)
        bg = self.blockgrid
        desc = self.mmdescriptor
        s = template % (bg.nprow, bg.npcol, desc.mb, desc.nb)
        return ' '.join([title, s])
class BlacsBandLayouts(BlacsLayouts):  # XXX should derive from BandLayouts too!
    """ScaLAPACK Dense Linear Algebra.

    This class is instantiated in the real-space code.  Not for
    casual use, at least for now.

    Requires two distributors and three descriptors for initialization
    as well as grid descriptors and band descriptors. Distributors are
    for cols2blocks (1D -> 2D BLACS grid) and blocks2rows (2D -> 1D
    BLACS grid). ScaLAPACK operations must occur on a 2D BLACS grid for
    performance and scalability. Redistribute of 1D *column* layout
    matrix will operate only on lower half of H or S. Redistribute of
    2D block will operate on entire matrix for U, but only lower half
    of C.

    inverse_cholesky is "hard-coded" for real-space code.
    Expects overlap matrix (S) and the coefficient matrix (C) to be a
    replicated data structures and *not* created by the BLACS descriptor
    class. This is due to the MPI_Reduce and MPI_Broadcast that will
    occur in the parallel matrix multiply. Input matrices should be:
    S = np.empty((nbands, mybands), dtype)
    C = np.empty((mybands, nbands), dtype)

    _standard_diagonalize is "hard-coded" for the real-space code.
    Expects both hamiltonian (H) and eigenvector matrix (U) to be a
    replicated data structures and not created by the BLACS descriptor
    class. This is due to the MPI_Reduce and MPI_Broadcast that will
    occur in the parallel matrix multiply. Input matrices should be:
    H = np.empty((nbands, mynbands), dtype)
    U = np.empty((mynbands, nbands), dtype)
    eps_n = np.empty(mynbands, dtype = float)
    """  # XXX rewrite this docstring a bit!

    matrix_descriptor_class = BlacsBandMatrixDescriptor

    # This class 'describes' all the realspace Blacs-related layouts
    def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
                 blocksize, buffer_size=None, timer=nulltimer):
        """Set up grids, descriptors and redistributors.

        All arguments except ``buffer_size`` are forwarded to
        ``BlacsLayouts.__init__``; ``buffer_size`` is only stored.
        """
        BlacsLayouts.__init__(self, gd, bd, block_comm, dtype,
                              mcpus, ncpus, blocksize, timer)
        self.buffer_size = buffer_size
        self.blocksize = blocksize
        self.mynbands = bd.mynbands

        nbands = bd.nbands
        mynbands = bd.mynbands
        nranks = bd.comm.size

        # 1D layout - columns
        self.columngrid = BlacsGrid(self.column_comm, 1, nranks)
        self.Nndescriptor = self.columngrid.new_descriptor(
            nbands, nbands, nbands, mynbands)

        # 2D layout
        self.nndescriptor = self.blockgrid.new_descriptor(
            nbands, nbands, blocksize, blocksize)

        # 1D layout - rows
        self.rowgrid = BlacsGrid(self.column_comm, nranks, 1)
        self.nNdescriptor = self.rowgrid.new_descriptor(
            nbands, nbands, mynbands, nbands)

        # Columns -> blocks.  Created without a triangle argument; a
        # variant passing 'L' (only the filled-out half of a Hermitian
        # matrix) was noted as faster but is not used.
        self.Nn2nn = Redistributor(self.block_comm,
                                   self.Nndescriptor,
                                   self.nndescriptor)

        # Blocks -> rows.  The resulting matrix will be used in dgemm,
        # which is symmetry oblivious.
        self.nn2nN = Redistributor(self.block_comm,
                                   self.nndescriptor,
                                   self.nNdescriptor)

    def diagonalize(self, H_nn, eps_n):
        """Diagonalize ``H_nn`` and store this rank's eigenvalues.

        ``H_nn`` is overwritten by the diagonalizer output; ``eps_n``
        receives the slice ``self.bd.get_slice()`` of all eigenvalues.
        """
        all_eps = np.empty(self.bd.nbands)

        self.timer.start('Diagonalize')
        self._diagonalize(H_nn, all_eps)
        self.timer.stop('Diagonalize')

        self.timer.start('Distribute results')
        # The eigenvalues are already on block_comm.rank = 0; it is
        # easier to broadcast all of them and take the slice afterwards.
        self.block_comm.broadcast(all_eps, 0)
        eps_n[:] = all_eps[self.bd.get_slice()]
        self.timer.stop('Distribute results')

    def _diagonalize(self, H_nn, eps_N):
        """Parallel diagonalizer."""
        # A copy serves as input so that H_nn can take the output.
        source = H_nn.copy()
        self.nndescriptor.diagonalize_dc(source, H_nn, eps_N, 'L')

    def inverse_cholesky(self, S_nn):
        """Timed wrapper around ``_inverse_cholesky`` plus a barrier."""
        self.timer.start('Inverse Cholesky')
        self._inverse_cholesky(S_nn)
        # Removing this barrier may lead to a race condition.
        self.block_comm.barrier()
        self.timer.stop('Inverse Cholesky')

    def _inverse_cholesky(self, S_nn):
        """Call the descriptor's ``inverse_cholesky`` on ``S_nn``."""
        self.nndescriptor.inverse_cholesky(S_nn, 'L')

    def get_description(self):
        """Return the base-class title joined with the filled template."""
        title, template = BlacsLayouts.get_description(self)
        grid = self.blockgrid
        descriptor = self.nndescriptor
        details = template % (grid.nprow, grid.npcol,
                              descriptor.mb, descriptor.nb)
        return ' '.join([title, details])
def calculate_forces(self, hamiltonian, F_av):
    """Add the LCAO force contributions to F_av in place.

    Five terms are evaluated -- kinetic, potential, basis-overlap
    ("theta"), PAW-correction ("rho") and atomic-Hamiltonian ("atom") --
    and their sum is added to F_av.  Afterwards F_av is summed over
    ksl.orbital_comm and, on band-communicator rank 0, over kd.comm
    (called with 0 as second argument).

    Two code paths exist, selected by whether self.ksl is a
    BlacsOrbitalLayouts instance: the BLACS path works on one 2D block
    of the density matrix per CPU and loops over atom pairs; the other
    path works on (mynao, nao) matrices and the derivative arrays
    filled by tci.calculate_derivative().

    Parameters:

    hamiltonian:
        Object providing vt_sG (indexed by kpt.s) and dH_asp (indexed
        by atom, then spin).
    F_av:
        Array indexed as [atom, cartesian component]; modified in place.
    """
    self.timer.start('LCAO forces')

    spos_ac = self.tci.atoms.get_scaled_positions() % 1.0
    ksl = self.ksl
    nao = ksl.nao
    mynao = ksl.mynao
    nq = len(self.kd.ibzk_qc)
    dtype = self.dtype
    tci = self.tci
    gd = self.gd
    bfs = self.basis_functions

    Mstart = ksl.Mstart
    Mstop = ksl.Mstop

    from gpaw.kohnsham_layouts import BlacsOrbitalLayouts
    isblacs = isinstance(ksl, BlacsOrbitalLayouts)  # XXX

    if not isblacs:
        self.timer.start('TCI derivative')
        dThetadR_qvMM = np.empty((nq, 3, mynao, nao), dtype)
        dTdR_qvMM = np.empty((nq, 3, mynao, nao), dtype)
        dPdR_aqvMi = {}
        for a in self.basis_functions.my_atom_indices:
            ni = self.setups[a].ni
            dPdR_aqvMi[a] = np.empty((nq, 3, nao, ni), dtype)
        tci.calculate_derivative(spos_ac, dThetadR_qvMM, dTdR_qvMM,
                                 dPdR_aqvMi)
        gd.comm.sum(dThetadR_qvMM)
        gd.comm.sum(dTdR_qvMM)
        self.timer.stop('TCI derivative')

        my_atom_indices = bfs.my_atom_indices
        atom_indices = bfs.atom_indices

        def _slices(indices):
            # Yield (atom, M1, M2): the atom's basis-function range
            # relative to Mstart, clipped at 0.  Atoms whose range ends
            # at or before Mstart are skipped.
            for a in indices:
                M1 = bfs.M_a[a] - Mstart
                M2 = M1 + self.setups[a].nao
                if M2 > 0:
                    yield a, max(0, M1), M2

        def slices():
            return _slices(atom_indices)

        def my_slices():
            return _slices(my_atom_indices)

    # Energy-weighted density matrix:
    #
    #   E_{mu nu} = sum_{x z} S^-1_{mu x} H_{x z} rho_{z nu}
    #             = sum_n c*_{n mu} eps_n f_n c_{n nu}
    #
    # We use the transpose of that matrix.  The first form is used
    # if rho is given, otherwise the coefficients are used.
    self.timer.start('Initial')

    rhoT_uMM = []
    ET_uMM = []

    if not isblacs:
        if self.kpt_u[0].rho_MM is None:
            self.timer.start('Get density matrix')
            for kpt in self.kpt_u:
                rhoT_MM = ksl.get_transposed_density_matrix(kpt.f_n,
                                                            kpt.C_nM)
                rhoT_uMM.append(rhoT_MM)
                ET_MM = ksl.get_transposed_density_matrix(
                    kpt.f_n * kpt.eps_n, kpt.C_nM)
                ET_uMM.append(ET_MM)

                if hasattr(kpt, 'c_on'):
                    # XXX does this work with BLACS/non-BLACS/etc.?
                    assert self.bd.comm.size == 1
                    d_nn = np.zeros((self.bd.mynbands, self.bd.mynbands),
                                    dtype=kpt.C_nM.dtype)
                    for ne, c_n in zip(kpt.ne_o, kpt.c_on):
                        d_nn += ne * np.outer(c_n.conj(), c_n)
                    # In-place updates: the arrays already appended to
                    # rhoT_uMM / ET_uMM above are the ones modified.
                    rhoT_MM += ksl.get_transposed_density_matrix_delta(
                        d_nn, kpt.C_nM)
                    ET_MM += ksl.get_transposed_density_matrix_delta(
                        d_nn * kpt.eps_n, kpt.C_nM)
            self.timer.stop('Get density matrix')
        else:
            for kpt in self.kpt_u:
                H_MM = self.eigensolver.calculate_hamiltonian_matrix(
                    hamiltonian, self, kpt)
                tri2full(H_MM)
                S_MM = kpt.S_MM.copy()
                tri2full(S_MM)
                ET_MM = np.linalg.solve(S_MM, gemmdot(H_MM,
                                                      kpt.rho_MM)).T.copy()
                del S_MM, H_MM
                rhoT_MM = kpt.rho_MM.T.copy()
                rhoT_uMM.append(rhoT_MM)
                ET_uMM.append(ET_MM)
    self.timer.stop('Initial')

    if isblacs:  # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
        from gpaw.blacs import BlacsGrid, Redistributor

        def get_density_matrix(f_n, C_nM, redistributor):
            rho1_mm = ksl.calculate_blocked_density_matrix(f_n,
                                                           C_nM).conj()
            rho_mm = redistributor.redistribute(rho1_mm)
            return rho_mm

        pcutoff_a = [max([pt.get_cutoff() for pt in setup.pt_j])
                     for setup in self.setups]
        phicutoff_a = [max([phit.get_cutoff() for phit in setup.phit_j])
                       for setup in self.setups]

        # XXX should probably use bdsize x gdsize instead
        # That would be consistent with some existing grids
        grid = BlacsGrid(ksl.block_comm, self.gd.comm.size,
                         self.bd.comm.size)

        # Ceiling division: one block per process row / column.
        blocksize1 = -(-nao // grid.nprow)
        blocksize2 = -(-nao // grid.npcol)
        # XXX what are rows and columns actually?
        desc = grid.new_descriptor(nao, nao, blocksize1, blocksize2)

        rhoT_umm = []
        ET_umm = []
        redistributor = Redistributor(grid.comm, ksl.mmdescriptor, desc)
        Fpot_av = np.zeros_like(F_av)
        for kpt in self.kpt_u:
            self.timer.start('Get density matrix')
            rhoT_mm = get_density_matrix(kpt.f_n, kpt.C_nM, redistributor)
            rhoT_umm.append(rhoT_mm)
            self.timer.stop('Get density matrix')

            self.timer.start('Potential')
            rhoT_mM = ksl.distribute_to_columns(rhoT_mm, desc)

            vt_G = hamiltonian.vt_sG[kpt.s]
            Fpot_av += bfs.calculate_force_contribution(vt_G, rhoT_mM,
                                                        kpt.q)
            del rhoT_mM
            self.timer.stop('Potential')

        self.timer.start('Get density matrix')
        for kpt in self.kpt_u:
            ET_mm = get_density_matrix(kpt.f_n * kpt.eps_n, kpt.C_nM,
                                       redistributor)
            ET_umm.append(ET_mm)
        self.timer.stop('Get density matrix')

        # Global index range [M1start, M1stop) x [M2start, M2stop) of
        # the block held by this process, and its local extent.
        M1start = blocksize1 * grid.myrow
        M2start = blocksize2 * grid.mycol

        M1stop = min(M1start + blocksize1, nao)
        M2stop = min(M2start + blocksize2, nao)

        m1max = M1stop - M1start
        m2max = M2stop - M2start

    if not isblacs:
        # Kinetic energy contribution
        #
        #   F^a += 2 Re sum_{mu in a; nu} (dT_{mu nu} / dR_{mu nu})
        #                                 * rho_{nu mu}
        #
        Fkin_av = np.zeros_like(F_av)
        for u, kpt in enumerate(self.kpt_u):
            dEdTrhoT_vMM = (dTdR_qvMM[kpt.q] *
                            rhoT_uMM[u][np.newaxis]).real
            for a, M1, M2 in my_slices():
                Fkin_av[a, :] += \
                    2.0 * dEdTrhoT_vMM[:, M1:M2].sum(-1).sum(-1)
            # Released per iteration so that an empty kpt_u cannot hit
            # a 'del' of a name that was never bound.
            del dEdTrhoT_vMM

        # Density matrix contribution due to basis overlap
        #
        #   F^a += -2 Re sum_{mu in a; nu} (dTheta_{mu nu} / dR_{mu nu})
        #                                  * E_{nu mu}
        #
        Ftheta_av = np.zeros_like(F_av)
        for u, kpt in enumerate(self.kpt_u):
            dThetadRE_vMM = (dThetadR_qvMM[kpt.q] *
                             ET_uMM[u][np.newaxis]).real
            for a, M1, M2 in my_slices():
                Ftheta_av[a, :] += \
                    -2.0 * dThetadRE_vMM[:, M1:M2].sum(-1).sum(-1)
            del dThetadRE_vMM

    if isblacs:
        from gpaw.lcao.overlap import TwoCenterIntegralCalculator
        self.timer.start('Prepare TCI loop')
        M_a = bfs.M_a

        Fkin2_av = np.zeros_like(F_av)
        Ftheta2_av = np.zeros_like(F_av)

        cell_cv = tci.atoms.cell
        spos_ac = tci.atoms.get_scaled_positions() % 1.0

        overlapcalc = TwoCenterIntegralCalculator(self.kd.ibzk_qc,
                                                  derivative=False)

        # XXX this is not parallel *AT ALL*.
        self.timer.start('Get neighbors')
        nl = tci.atompairs.pairs.neighbors
        r_and_offset_aao = get_r_and_offsets(nl, spos_ac, cell_cv)
        # sorted() instead of keys() + .sort(): dict views have no
        # sort() method on Python 3; the resulting list is the same.
        atompairs = sorted(r_and_offset_aao.keys())
        self.timer.stop('Get neighbors')

        T_expansions = tci.T_expansions
        Theta_expansions = tci.Theta_expansions
        P_expansions = tci.P_expansions
        nq = len(self.kd.ibzk_qc)

        dH_asp = hamiltonian.dH_asp

        self.timer.start('broadcast dH')
        alldH_asp = {}
        for a in range(len(self.setups)):
            gdrank = bfs.sphere_a[a].rank
            if gdrank == gd.rank:
                dH_sp = dH_asp[a]
            else:
                ni = self.setups[a].ni
                dH_sp = np.empty((self.nspins, ni * (ni + 1) // 2))
            gd.comm.broadcast(dH_sp, gdrank)
            # okay, now everyone gets copies of dH_sp
            alldH_asp[a] = dH_sp
        self.timer.stop('broadcast dH')

        # This will get sort of hairy.  We need to account for some
        # three-center overlaps, such as:
        #
        #   < dPhi^a1 / dR | p~^a3 > dH^a3 < p~^a3 | Phi^a2 > rho^{a2,a1}
        #
        # To this end we will loop over all pairs of atoms (a1, a3),
        # and then a sub-loop over (a3, a2).
        from gpaw.lcao.overlap import DerivativeAtomicDisplacement

        class Displacement(DerivativeAtomicDisplacement):
            def __init__(self, a1, a2, R_c, offset):
                phases = overlapcalc.phaseclass(overlapcalc.ibzk_qc,
                                                offset)
                DerivativeAtomicDisplacement.__init__(self, None, a1, a2,
                                                      R_c, offset, phases)

        # Cache of Displacement objects with evaluated spherical
        # harmonics.
        disp_aao = {}

        def get_displacements(a1, a2, maxdistance):
            # XXX the way maxdistance is handled it can lead to
            # bad caching when different maxdistances are passed
            # to subsequent calls with same pair of atoms
            disp_o = disp_aao.get((a1, a2))
            if disp_o is None:
                disp_o = []
                for R_c, offset in r_and_offset_aao[(a1, a2)]:
                    if np.linalg.norm(R_c) > maxdistance:
                        continue
                    disp = Displacement(a1, a2, R_c, offset)
                    disp_o.append(disp)
                disp_aao[(a1, a2)] = disp_o
            return [disp for disp in disp_o if disp.r < maxdistance]

        self.timer.stop('Prepare TCI loop')
        self.timer.start('Not so complicated loop')

        for (a1, a2) in atompairs:
            if a1 >= a2:
                # Actually this leads to bad load balance.
                # We should take a1 > a2 or a1 < a2 equally many times.
                # Maybe decide which of these choices
                # depending on whether a2 % 1 == 0
                continue

            m1start = M_a[a1] - M1start
            m2start = M_a[a2] - M2start
            if m1start >= blocksize1 or m2start >= blocksize2:
                continue  # (we have only one block per CPU)

            T_expansion = T_expansions.get(a1, a2)
            Theta_expansion = Theta_expansions.get(a1, a2)
            nm1, nm2 = T_expansion.shape

            m1stop = min(m1start + nm1, m1max)
            m2stop = min(m2start + nm2, m2max)

            if m1stop <= 0 or m2stop <= 0:
                continue

            # m*: slice into the local density-matrix block.
            # J*: matching slice into the atom-pair expansion.
            m1start = max(m1start, 0)
            m2start = max(m2start, 0)
            J1start = max(0, M1start - M_a[a1])
            J2start = max(0, M2start - M_a[a2])
            # Named J1stop (not M1stop) so the block bound computed
            # above is no longer overwritten inside the loop.
            J1stop = J1start + m1stop - m1start
            J2stop = J2start + m2stop - m2start

            dTdR_qvmm = T_expansion.zeros((nq, 3), dtype=dtype)
            dThetadR_qvmm = Theta_expansion.zeros((nq, 3), dtype=dtype)

            disp_o = get_displacements(a1, a2,
                                       phicutoff_a[a1] + phicutoff_a[a2])
            for disp in disp_o:
                disp.evaluate_overlap(T_expansion, dTdR_qvmm)
                disp.evaluate_overlap(Theta_expansion, dThetadR_qvmm)

            for u, kpt in enumerate(self.kpt_u):
                rhoT_mm = rhoT_umm[u][m1start:m1stop, m2start:m2stop]
                ET_mm = ET_umm[u][m1start:m1stop, m2start:m2stop]
                Fkin_v = 2.0 * (
                    dTdR_qvmm[kpt.q][:, J1start:J1stop, J2start:J2stop] *
                    rhoT_mm[np.newaxis]).real.sum(-1).sum(-1)
                Ftheta_v = 2.0 * (
                    dThetadR_qvmm[kpt.q][:, J1start:J1stop,
                                         J2start:J2stop] *
                    ET_mm[np.newaxis]).real.sum(-1).sum(-1)
                Fkin2_av[a1] += Fkin_v
                Fkin2_av[a2] -= Fkin_v
                Ftheta2_av[a1] -= Ftheta_v
                Ftheta2_av[a2] += Ftheta_v

        Fkin_av = Fkin2_av
        Ftheta_av = Ftheta2_av
        self.timer.stop('Not so complicated loop')

        # Cache of (dHP_uim, dSP_uim) keyed by atom pair (a2, a3).
        dHP_and_dSP_aauim = {}

        # For every a3, the list of atoms a2 paired with it.
        a2values = {}
        for (a2, a3) in atompairs:
            if a3 not in a2values:
                a2values[a3] = []
            a2values[a3].append(a2)

        Fatom_av = np.zeros_like(F_av)
        Frho_av = np.zeros_like(F_av)
        self.timer.start('Complicated loop')
        for a1, a3 in atompairs:
            if a1 == a3:
                # Functions reside on same atom, so their overlap
                # does not change when atom is displaced
                continue

            m1start = M_a[a1] - M1start
            if m1start >= blocksize1:
                continue

            P_expansion = P_expansions.get(a1, a3)
            nm1 = P_expansion.shape[0]
            m1stop = min(m1start + nm1, m1max)
            if m1stop <= 0:
                continue

            m1start = max(m1start, 0)
            J1start = max(0, M1start - M_a[a1])
            J1stop = J1start + m1stop - m1start

            disp_o = get_displacements(a1, a3,
                                       phicutoff_a[a1] + pcutoff_a[a3])
            if len(disp_o) == 0:
                continue

            dPdR_qvmi = P_expansion.zeros((nq, 3), dtype=dtype)
            for disp in disp_o:
                disp.evaluate_overlap(P_expansion, dPdR_qvmi)

            dPdR_qvmi = dPdR_qvmi[:, :, J1start:J1stop, :].copy()
            for a2 in a2values[a3]:
                m2start = M_a[a2] - M2start
                if m2start >= blocksize2:
                    continue

                P_expansion2 = P_expansions.get(a2, a3)
                nm2 = P_expansion2.shape[0]
                m2stop = min(m2start + nm2, m2max)
                if m2stop <= 0:
                    continue

                disp_o = get_displacements(a2, a3,
                                           phicutoff_a[a2] +
                                           pcutoff_a[a3])
                if len(disp_o) == 0:
                    continue

                m2start = max(m2start, 0)
                J2start = max(0, M2start - M_a[a2])
                J2stop = J2start + m2stop - m2start

                if (a2, a3) in dHP_and_dSP_aauim:
                    dHP_uim, dSP_uim = dHP_and_dSP_aauim[(a2, a3)]
                else:
                    P_qmi = P_expansion2.zeros((nq,), dtype=dtype)
                    for disp in disp_o:
                        disp.evaluate_direct(P_expansion2, P_qmi)
                    P_qmi = P_qmi[:, J2start:J2stop].copy()
                    dH_sp = alldH_asp[a3]
                    dS_ii = self.setups[a3].dO_ii

                    dHP_uim = []
                    dSP_uim = []
                    for kpt in self.kpt_u:
                        dH_ii = unpack(dH_sp[kpt.s])
                        dHP_im = np.dot(P_qmi[kpt.q], dH_ii).T.conj()
                        # XXX only need nq of these
                        dSP_im = np.dot(P_qmi[kpt.q], dS_ii).T.conj()
                        dHP_uim.append(dHP_im)
                        dSP_uim.append(dSP_im)
                    dHP_and_dSP_aauim[(a2, a3)] = dHP_uim, dSP_uim

                for u, kpt in enumerate(self.kpt_u):
                    rhoT_mm = rhoT_umm[u][m1start:m1stop, m2start:m2stop]
                    ET_mm = ET_umm[u][m1start:m1stop, m2start:m2stop]
                    dPdRdHP_vmm = np.dot(dPdR_qvmi[kpt.q], dHP_uim[u])
                    dPdRdSP_vmm = np.dot(dPdR_qvmi[kpt.q], dSP_uim[u])

                    Fatom_c = 2.0 * (dPdRdHP_vmm *
                                     rhoT_mm).real.sum(-1).sum(-1)
                    Frho_c = 2.0 * (dPdRdSP_vmm *
                                    ET_mm).real.sum(-1).sum(-1)
                    Fatom_av[a1] += Fatom_c
                    Fatom_av[a3] -= Fatom_c

                    Frho_av[a1] -= Frho_c
                    Frho_av[a3] += Frho_c

        self.timer.stop('Complicated loop')

    if not isblacs:
        # Potential contribution
        #
        #   F^a += -2 Re sum_{mu in a; nu}
        #              integral (dPhi_mu(r) / dR_a) v~(r) Phi_nu(r) dr
        #              * rho_{nu mu}
        #
        self.timer.start('Potential')
        Fpot_av = np.zeros_like(F_av)
        for u, kpt in enumerate(self.kpt_u):
            vt_G = hamiltonian.vt_sG[kpt.s]
            Fpot_av += bfs.calculate_force_contribution(vt_G, rhoT_uMM[u],
                                                        kpt.q)
        self.timer.stop('Potential')

        # Density matrix contribution from PAW correction
        #
        #   F^a += 2 Re sum_{mu nu} Z^a_{mu nu} E_{nu mu}
        #        - 2 Re sum_{b; mu in a; nu} Z^b_{mu nu} E_{nu mu}
        #
        # with
        #
        #   Z^b_{mu nu} = sum_{ij} (dP^b*_{i mu} / dR_{b mu})
        #                          dS^b_{ij} P^b_{j nu}
        #
        self.timer.start('Paw correction')
        Frho_av = np.zeros_like(F_av)
        for u, kpt in enumerate(self.kpt_u):
            work_MM = np.zeros((mynao, nao), dtype)
            ZE_MM = None
            for b in my_atom_indices:
                setup = self.setups[b]
                dO_ii = np.asarray(setup.dO_ii, dtype)
                dOP_iM = np.zeros((setup.ni, nao), dtype)
                gemm(1.0, self.P_aqMi[b][kpt.q], dO_ii, 0.0, dOP_iM, 'c')
                for v in range(3):
                    gemm(1.0, dOP_iM,
                         dPdR_aqvMi[b][kpt.q][v][Mstart:Mstop],
                         0.0, work_MM, 'n')
                    ZE_MM = (work_MM * ET_uMM[u]).real
                    for a, M1, M2 in slices():
                        dE = 2 * ZE_MM[M1:M2].sum()
                        Frho_av[a, v] -= dE  # the "b; mu in a; nu" term
                        Frho_av[b, v] += dE  # the "mu nu" term
            del work_MM, ZE_MM
        self.timer.stop('Paw correction')

        # Atomic density contribution
        #
        #   F^a += -2 Re sum_{mu nu} A^a_{mu nu} rho_{nu mu}
        #        + 2 Re sum_{b; mu in a; nu} A^b_{mu nu} rho_{nu mu}
        #
        # with
        #
        #   A^b_{mu nu} = sum_{ij} (dP^b*_{i mu} / dR_{b mu})
        #                          dH^b_{ij} P^b_{j nu}
        #
        self.timer.start('Atomic Hamiltonian force')
        Fatom_av = np.zeros_like(F_av)
        for u, kpt in enumerate(self.kpt_u):
            for b in my_atom_indices:
                H_ii = np.asarray(unpack(hamiltonian.dH_asp[b][kpt.s]),
                                  dtype)
                HP_iM = gemmdot(
                    H_ii,
                    np.ascontiguousarray(self.P_aqMi[b][kpt.q].T.conj()))
                for v in range(3):
                    dPdR_Mi = dPdR_aqvMi[b][kpt.q][v][Mstart:Mstop]
                    ArhoT_MM = (gemmdot(dPdR_Mi, HP_iM) *
                                rhoT_uMM[u]).real
                    for a, M1, M2 in slices():
                        dE = 2 * ArhoT_MM[M1:M2].sum()
                        Fatom_av[a, v] += dE  # the "b; mu in a; nu" term
                        Fatom_av[b, v] -= dE  # the "mu nu" term
        self.timer.stop('Atomic Hamiltonian force')

    F_av += Fkin_av + Fpot_av + Ftheta_av + Frho_av + Fatom_av
    self.timer.start('Wait for sum')
    ksl.orbital_comm.sum(F_av)
    if self.bd.comm.rank == 0:
        self.kd.comm.sum(F_av, 0)
    self.timer.stop('Wait for sum')
    self.timer.stop('LCAO forces')
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Check the simple PBLAS wrappers against serial BLAS references.

    Random matrices are created with master-only descriptors, the
    reference results (gemm, gemv, r2k, rk) are computed on rank 0,
    the inputs are redistributed to block-cyclic layouts, the parallel
    routines are run, and the results are collected back and compared
    with the references using equal(..., 0, tol).

    M, N, K are the matrix dimensions, seed initialises the random
    generator, mprocs x nprocs is the BLACS grid and dtype selects
    real or complex test data.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    # Imaginary parts are only added for complex test data.
    if dtype == complex:
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)

    # Populate matrices local to master:
    A0 = gen.rand(*globA.shape) + epsilon * gen.rand(*globA.shape)
    B0 = gen.rand(*globB.shape) + epsilon * gen.rand(*globB.shape)
    D0 = gen.rand(*globD.shape) + epsilon * gen.rand(*globD.shape)
    X0 = gen.rand(*globX.shape) + epsilon * gen.rand(*globX.shape)

    # Local result matrices
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U0 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates

    # Local reference matrix product:
    if rank == 0:
        # C0[:] = np.dot(A0, B0)
        gemm(1.0, B0, A0, 0.0, C0)
        #gemm(1.0, A0, A0, 0.0, Z0, transa='t')
        # Single-argument print with %-formatting gives the same text
        # under both the print statement and the print function.
        print('%s %s' % (A0.shape, Z0.shape))
        Z0[:] = np.dot(A0.T, A0)
        # Y0[:] = np.dot(A0, X0)
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)

    assert globA.check(A0) and globB.check(B0) and globC.check(C0)
    assert globX.check(X0) and globY.check(Y0)
    assert globD.check(D0) and globS.check(S0) and globU.check(U0)

    # Create distributed descriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)

    # Distributed matrices:
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U = distU.zeros(dtype=dtype)  # zeros needed for rank-updates

    Redistributor(world, globA, distA).redistribute(A0, A)
    Redistributor(world, globB, distB).redistribute(B0, B)
    Redistributor(world, globX, distX).redistribute(X0, X)
    Redistributor(world, globD, distD).redistribute(D0, D)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    # NOTE: Z is computed here but never collected or compared below.
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa='T')
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)

    # Collect result back on master
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U1 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    Redistributor(world, distC, globC).redistribute(C, C1)
    Redistributor(world, distY, globY).redistribute(Y, Y1)
    Redistributor(world, distS, globS).redistribute(S, S1)
    Redistributor(world, distU, globU).redistribute(U, U1)

    if rank == 0:
        gemm_err = abs(C1 - C0).max()
        gemv_err = abs(Y1 - Y0).max()
        r2k_err = abs(S1 - S0).max()
        rk_err = abs(U1 - U0).max()
        print('gemm err %s' % gemm_err)
        print('gemv err %s' % gemv_err)
        print('r2k err %s' % r2k_err)
        print('rk_err %s' % rk_err)
    else:
        gemm_err = 0.0
        gemv_err = 0.0
        r2k_err = 0.0
        rk_err = 0.0

    # We don't like exceptions on only one cpu
    gemm_err = world.sum(gemm_err)
    gemv_err = world.sum(gemv_err)
    r2k_err = world.sum(r2k_err)
    rk_err = world.sum(rk_err)

    equal(gemm_err, 0, tol)
    equal(gemv_err, 0, tol)
    equal(r2k_err, 0, tol)
    equal(rk_err, 0, tol)
from gpaw.mpi import world
from gpaw.blacs import BlacsGrid, Redistributor

# Script: redistribute an M x N sub-matrix of `a` (starting at ia, ja)
# into `b` (starting at ib, jb) between two block-cyclic layouts on the
# same 2 x (world.size // 2) BLACS grid.

if world.size < 2:
    raise ValueError("Runs on two or more processors")

grid = BlacsGrid(world, 2, world.size // 2)

# Source matrix: every local entry holds the owning rank's number.
desc = grid.new_descriptor(12, 8, 2, 3)
a = desc.zeros()
a[:] = world.rank

# Destination matrix with a different global shape and block size.
subdesc = grid.new_descriptor(7, 7, 2, 2)
b = subdesc.zeros()

r = Redistributor(grid.comm, desc, subdesc, uplo="G")

# Offsets of the sub-matrix in a and b, and its extent.
ia = 3
ja = 2
ib = 1
jb = 1
M = 4
N = 5

r.redistribute(a, b, M, N, ia, ja, ib, jb)

# Both collect calls are made on every rank; only rank 0 prints.
a0 = desc.collect_on_master(a)
b0 = subdesc.collect_on_master(b)
if world.rank == 0:
    # Parenthesised single argument: valid as both a Python 2 print
    # statement and a Python 3 print call, with identical output.
    print(a0)
def main(N=73, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Check ScaLAPACK diagonalization and inverse Cholesky wrappers.

    Hermitian test matrices H0 and S0 are built on rank 0 and the
    reference eigenvalues / inverse Cholesky factor are computed
    there.  The matrices are then redistributed to an 8 x 8 blocked
    layout, the divide-and-conquer drivers and the inverse Cholesky
    routine are run, and the maximum absolute deviations from the
    references are asserted to be below tol.

    N is the matrix dimension, seed initialises the random generator,
    mprocs x nprocs is the BLACS grid and dtype selects real or
    complex test data.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    # Imaginary parts are only added for complex test data.
    if dtype == complex:
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    glob = grid.new_descriptor(N, N, N, N)

    # Populate matrices local to master:
    H0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    S0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    C0 = glob.empty(dtype=dtype)
    if rank == 0:
        # Complex case must have real numbers on the diagonal.
        # We make a simple complex Hermitian matrix below.
        H0 = H0 + epsilon * (0.1*np.tri(N, N, k= -N // nprocs) + 0.3*np.tri(N, N, k=-1))
        S0 = S0 + epsilon * (0.2*np.tri(N, N, k= -N // nprocs) + 0.4*np.tri(N, N, k=-1))
        # Make matrices symmetric
        rk(1.0, H0.copy(), 0.0, H0)
        rk(1.0, S0.copy(), 0.0, S0)
        # Overlap matrix must be semi-positive definite
        S0 = S0 + 50.0*np.eye(N, N, 0)
        # Hamiltonian is usually diagonally dominant
        H0 = H0 + 75.0*np.eye(N, N, 0)
        C0 = S0.copy()

    # Local result matrices
    W0 = np.empty((N), dtype=float)
    W0_g = np.empty((N), dtype=float)

    # Calculate eigenvalues
    if rank == 0:
        diagonalize(H0.copy(), W0)
        general_diagonalize(H0.copy(), W0_g, S0.copy())
        inverse_cholesky(C0)  # result returned in lower triangle
        # tri2full(C0) # symmetrize

    assert glob.check(H0) and glob.check(S0) and glob.check(C0)

    # Create distributed descriptors with various block sizes:
    dist = grid.new_descriptor(N, N, 8, 8)

    # Distributed matrices:
    # We can use empty here, but end up with garbage on
    # the other half of the triangle when we redistribute.
    # This is fine because ScaLAPACK does not care.

    H = dist.empty(dtype=dtype)
    S = dist.empty(dtype=dtype)
    Z = dist.empty(dtype=dtype)
    C = dist.empty(dtype=dtype)

    # Eigenvalues are non-BLACS matrices
    W = np.empty((N), dtype=float)
    W_dc = np.empty((N), dtype=float)
    W_mr3 = np.empty((N), dtype=float)
    W_g = np.empty((N), dtype=float)
    W_g_dc = np.empty((N), dtype=float)
    W_g_mr3 = np.empty((N), dtype=float)

    Glob2dist = Redistributor(world, glob, dist)
    Glob2dist.redistribute(H0, H, uplo='L')
    Glob2dist.redistribute(S0, S, uplo='L')
    Glob2dist.redistribute(S0, C, uplo='L')  # C0 was previously overwritten

    # we don't test the expert drivers anymore since there
    # might be a buffer overflow error
    ## scalapack_diagonalize_ex(dist, H.copy(), Z, W, 'L')
    scalapack_diagonalize_dc(dist, H.copy(), Z, W_dc, 'L')
    ## scalapack_diagonalize_mr3(dist, H.copy(), Z, W_mr3, 'L')
    ## scalapack_general_diagonalize_ex(dist, H.copy(), S.copy(), Z, W_g, 'L')
    scalapack_general_diagonalize_dc(dist, H.copy(), S.copy(), Z, W_g_dc, 'L')
    ## scalapack_general_diagonalize_mr3(dist, H.copy(), S.copy(), Z, W_g_mr3, 'L')

    scalapack_inverse_cholesky(dist, C, 'L')

    # Undo redistribute
    C_test = glob.empty(dtype=dtype)
    Dist2glob = Redistributor(world, dist, glob)
    Dist2glob.redistribute(C, C_test)

    if rank == 0:
        ## diag_ex_err = abs(W - W0).max()
        diag_dc_err = abs(W_dc - W0).max()
        ## diag_mr3_err = abs(W_mr3 - W0).max()
        ## general_diag_ex_err = abs(W_g - W0_g).max()
        general_diag_dc_err = abs(W_g_dc - W0_g).max()
        ## general_diag_mr3_err = abs(W_g_mr3 - W0_g).max()
        inverse_chol_err = abs(C_test-C0).max()
        # Single-argument print with %-formatting gives the same text
        # under both the print statement and the print function.
        ## print 'diagonalize ex err', diag_ex_err
        print('diagonalize dc err %s' % diag_dc_err)
        ## print 'diagonalize mr3 err', diag_mr3_err
        ## print 'general diagonalize ex err', general_diag_ex_err
        print('general diagonalize dc err %s' % general_diag_dc_err)
        ## print 'general diagonalize mr3 err', general_diag_mr3_err
        print('inverse chol err %s' % inverse_chol_err)
    else:
        ## diag_ex_err = 0.0
        diag_dc_err = 0.0
        ## diag_mr3_err = 0.0
        ## general_diag_ex_err = 0.0
        general_diag_dc_err = 0.0
        ## general_diag_mr3_err = 0.0
        inverse_chol_err = 0.0

    # We don't like exceptions on only one cpu
    ## diag_ex_err = world.sum(diag_ex_err)
    diag_dc_err = world.sum(diag_dc_err)
    ## diag_mr3_err = world.sum(diag_mr3_err)
    ## general_diag_ex_err = world.sum(general_diag_ex_err)
    general_diag_dc_err = world.sum(general_diag_dc_err)
    ## general_diag_mr3_err = world.sum(general_diag_mr3_err)
    inverse_chol_err = world.sum(inverse_chol_err)
    ## assert diag_ex_err < tol
    assert diag_dc_err < tol
    ## assert diag_mr3_err < tol
    ## assert general_diag_ex_err < tol
    assert general_diag_dc_err < tol
    ## assert general_diag_mr3_err < tol
    assert inverse_chol_err < tol
# in trunk/gpaw/blacs.py for some discussions of # these idiosyncracies. import numpy as np from gpaw.blacs import BlacsGrid, parallelprint from gpaw.mpi import world from gpaw.utilities.scalapack import pblas_simple_gemm gen = np.random.RandomState(42) # simulate state-parallelization=2 and # domain-decomposition.prod=32 B = 2 D = 32 mb = 32 grid = BlacsGrid(world, B, D) nbands = 500 nG = 80**3 nGdesc = grid.new_descriptor(nbands, nG, nbands // B, nG // D) nndesc = grid.new_descriptor(nbands, nbands, mb, mb) psit_nG = gen.rand(*nGdesc.shape) A_nn = gen.rand(*nndesc.shape) assert nGdesc.check(psit_nG) assert nndesc.check(A_nn) parallelprint(world, (A_nn.shape, nndesc.shape, nndesc.lld))
def calculate_forces(self, hamiltonian, F_av): self.timer.start('LCAO forces') spos_ac = self.tci.atoms.get_scaled_positions() % 1.0 ksl = self.ksl nao = ksl.nao mynao = ksl.mynao nq = len(self.kd.ibzk_qc) dtype = self.dtype tci = self.tci gd = self.gd bfs = self.basis_functions Mstart = ksl.Mstart Mstop = ksl.Mstop from gpaw.kohnsham_layouts import BlacsOrbitalLayouts isblacs = isinstance(ksl, BlacsOrbitalLayouts) # XXX if not isblacs: self.timer.start('TCI derivative') dThetadR_qvMM = np.empty((nq, 3, mynao, nao), dtype) dTdR_qvMM = np.empty((nq, 3, mynao, nao), dtype) dPdR_aqvMi = {} for a in self.basis_functions.my_atom_indices: ni = self.setups[a].ni dPdR_aqvMi[a] = np.empty((nq, 3, nao, ni), dtype) tci.calculate_derivative(spos_ac, dThetadR_qvMM, dTdR_qvMM, dPdR_aqvMi) gd.comm.sum(dThetadR_qvMM) gd.comm.sum(dTdR_qvMM) self.timer.stop('TCI derivative') my_atom_indices = bfs.my_atom_indices atom_indices = bfs.atom_indices def _slices(indices): for a in indices: M1 = bfs.M_a[a] - Mstart M2 = M1 + self.setups[a].nao if M2 > 0: yield a, max(0, M1), M2 def slices(): return _slices(atom_indices) def my_slices(): return _slices(my_atom_indices) # # ----- ----- # \ -1 \ * # E = ) S H rho = ) c eps f c # mu nu / mu x x z z nu / n mu n n n nu # ----- ----- # x z n # # We use the transpose of that matrix. The first form is used # if rho is given, otherwise the coefficients are used. self.timer.start('Initial') rhoT_uMM = [] ET_uMM = [] if not isblacs: if self.kpt_u[0].rho_MM is None: self.timer.start('Get density matrix') for kpt in self.kpt_u: rhoT_MM = ksl.get_transposed_density_matrix(kpt.f_n, kpt.C_nM) rhoT_uMM.append(rhoT_MM) ET_MM = ksl.get_transposed_density_matrix(kpt.f_n * kpt.eps_n, kpt.C_nM) ET_uMM.append(ET_MM) if hasattr(kpt, 'c_on'): # XXX does this work with BLACS/non-BLACS/etc.? 
assert self.bd.comm.size == 1 d_nn = np.zeros((self.bd.mynbands, self.bd.mynbands), dtype=kpt.C_nM.dtype) for ne, c_n in zip(kpt.ne_o, kpt.c_on): d_nn += ne * np.outer(c_n.conj(), c_n) rhoT_MM += ksl.get_transposed_density_matrix_delta(d_nn, kpt.C_nM) ET_MM += ksl.get_transposed_density_matrix_delta(d_nn * kpt.eps_n, kpt.C_nM) self.timer.stop('Get density matrix') else: rhoT_uMM = [] ET_uMM = [] for kpt in self.kpt_u: H_MM = self.eigensolver.calculate_hamiltonian_matrix(hamiltonian, self, kpt) tri2full(H_MM) S_MM = kpt.S_MM.copy() tri2full(S_MM) ET_MM = np.linalg.solve(S_MM, gemmdot(H_MM, kpt.rho_MM)).T.copy() del S_MM, H_MM rhoT_MM = kpt.rho_MM.T.copy() rhoT_uMM.append(rhoT_MM) ET_uMM.append(ET_MM) self.timer.stop('Initial') if isblacs: # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX from gpaw.blacs import BlacsGrid, Redistributor def get_density_matrix(f_n, C_nM, redistributor): rho1_mm = ksl.calculate_blocked_density_matrix(f_n, C_nM).conj() rho_mm = redistributor.redistribute(rho1_mm) return rho_mm pcutoff_a = [max([pt.get_cutoff() for pt in setup.pt_j]) for setup in self.setups] phicutoff_a = [max([phit.get_cutoff() for phit in setup.phit_j]) for setup in self.setups] # XXX should probably use bdsize x gdsize instead # That would be consistent with some existing grids grid = BlacsGrid(ksl.block_comm, self.gd.comm.size, self.bd.comm.size) blocksize1 = -(-nao // grid.nprow) blocksize2 = -(-nao // grid.npcol) # XXX what are rows and columns actually? 
desc = grid.new_descriptor(nao, nao, blocksize1, blocksize2) rhoT_umm = [] ET_umm = [] redistributor = Redistributor(grid.comm, ksl.mmdescriptor, desc) Fpot_av = np.zeros_like(F_av) for u, kpt in enumerate(self.kpt_u): self.timer.start('Get density matrix') rhoT_mm = get_density_matrix(kpt.f_n, kpt.C_nM, redistributor) rhoT_umm.append(rhoT_mm) self.timer.stop('Get density matrix') self.timer.start('Potential') rhoT_mM = ksl.distribute_to_columns(rhoT_mm, desc) vt_G = hamiltonian.vt_sG[kpt.s] Fpot_av += bfs.calculate_force_contribution(vt_G, rhoT_mM, kpt.q) del rhoT_mM self.timer.stop('Potential') self.timer.start('Get density matrix') for kpt in self.kpt_u: ET_mm = get_density_matrix(kpt.f_n * kpt.eps_n, kpt.C_nM, redistributor) ET_umm.append(ET_mm) self.timer.stop('Get density matrix') M1start = blocksize1 * grid.myrow M2start = blocksize2 * grid.mycol M1stop = min(M1start + blocksize1, nao) M2stop = min(M2start + blocksize2, nao) m1max = M1stop - M1start m2max = M2stop - M2start if not isblacs: # Kinetic energy contribution # # ----- d T # a \ mu nu # F += 2 Re ) -------- rho # / d R nu mu # ----- mu nu # mu in a; nu # Fkin_av = np.zeros_like(F_av) for u, kpt in enumerate(self.kpt_u): dEdTrhoT_vMM = (dTdR_qvMM[kpt.q] * rhoT_uMM[u][np.newaxis]).real for a, M1, M2 in my_slices(): Fkin_av[a, :] += 2.0 * dEdTrhoT_vMM[:, M1:M2].sum(-1).sum(-1) del dEdTrhoT_vMM # Density matrix contribution due to basis overlap # # ----- d Theta # a \ mu nu # F += -2 Re ) ------------ E # / d R nu mu # ----- mu nu # mu in a; nu # Ftheta_av = np.zeros_like(F_av) for u, kpt in enumerate(self.kpt_u): dThetadRE_vMM = (dThetadR_qvMM[kpt.q] * ET_uMM[u][np.newaxis]).real for a, M1, M2 in my_slices(): Ftheta_av[a, :] += -2.0 * dThetadRE_vMM[:, M1:M2].sum(-1).sum(-1) del dThetadRE_vMM if isblacs: from gpaw.lcao.overlap import TwoCenterIntegralCalculator self.timer.start('Prepare TCI loop') M_a = bfs.M_a Fkin2_av = np.zeros_like(F_av) Ftheta2_av = np.zeros_like(F_av) cell_cv = tci.atoms.cell 
spos_ac = tci.atoms.get_scaled_positions() % 1.0 overlapcalc = TwoCenterIntegralCalculator(self.kd.ibzk_qc, derivative=False) def get_phases(offset): return overlapcalc.phaseclass(overlapcalc.ibzk_qc, offset) # XXX this is not parallel *AT ALL*. self.timer.start('Get neighbors') nl = tci.atompairs.pairs.neighbors r_and_offset_aao = get_r_and_offsets(nl, spos_ac, cell_cv) atompairs = r_and_offset_aao.keys() atompairs.sort() self.timer.stop('Get neighbors') T_expansions = tci.T_expansions Theta_expansions = tci.Theta_expansions P_expansions = tci.P_expansions nq = len(self.ibzk_qc) dH_asp = hamiltonian.dH_asp self.timer.start('broadcast dH') alldH_asp = {} for a in range(len(self.setups)): gdrank = bfs.sphere_a[a].rank if gdrank == gd.rank: dH_sp = dH_asp[a] else: ni = self.setups[a].ni dH_sp = np.empty((self.nspins, ni * (ni + 1) // 2)) gd.comm.broadcast(dH_sp, gdrank) # okay, now everyone gets copies of dH_sp alldH_asp[a] = dH_sp self.timer.stop('broadcast dH') # This will get sort of hairy. We need to account for some # three-center overlaps, such as: # # a1 # Phi ~a3 a3 ~a3 a2 a2,a1 # < ---- |p > dH <p |Phi > rho # dR # # To this end we will loop over all pairs of atoms (a1, a3), # and then a sub-loop over (a3, a2). from gpaw.lcao.overlap import DerivativeAtomicDisplacement class Displacement(DerivativeAtomicDisplacement): def __init__(self, a1, a2, R_c, offset): phases = overlapcalc.phaseclass(overlapcalc.ibzk_qc, offset) DerivativeAtomicDisplacement.__init__(self, None, a1, a2, R_c, offset, phases) # Cache of Displacement objects with spherical harmonics with # evaluated spherical harmonics. 
disp_aao = {} def get_displacements(a1, a2, maxdistance): # XXX the way maxdistance is handled it can lead to # bad caching when different maxdistances are passed # to subsequent calls with same pair of atoms disp_o = disp_aao.get((a1, a2)) if disp_o is None: disp_o = [] for r, offset in r_and_offset_aao[(a1, a2)]: if np.linalg.norm(r) > maxdistance: continue disp = Displacement(a1, a2, r, offset) disp_o.append(disp) disp_aao[(a1, a2)] = disp_o return [disp for disp in disp_o if disp.r < maxdistance] self.timer.stop('Prepare TCI loop') self.timer.start('Not so complicated loop') for (a1, a2) in atompairs: if a1 >= a2: # Actually this leads to bad load balance. # We should take a1 > a2 or a1 < a2 equally many times. # Maybe decide which of these choices # depending on whether a2 % 1 == 0 continue m1start = M_a[a1] - M1start m2start = M_a[a2] - M2start if m1start >= blocksize1 or m2start >= blocksize2: continue T_expansion = T_expansions.get(a1, a2) Theta_expansion = Theta_expansions.get(a1, a2) P_expansion = P_expansions.get(a1, a2) nm1, nm2 = T_expansion.shape m1stop = min(m1start + nm1, m1max) m2stop = min(m2start + nm2, m2max) if m1stop <= 0 or m2stop <= 0: continue m1start = max(m1start, 0) m2start = max(m2start, 0) J1start = max(0, M1start - M_a[a1]) J2start = max(0, M2start - M_a[a2]) M1stop = J1start + m1stop - m1start J2stop = J2start + m2stop - m2start dTdR_qvmm = T_expansion.zeros((nq, 3), dtype=dtype) dThetadR_qvmm = Theta_expansion.zeros((nq, 3), dtype=dtype) disp_o = get_displacements(a1, a2, phicutoff_a[a1] + phicutoff_a[a2]) for disp in disp_o: disp.evaluate_overlap(T_expansion, dTdR_qvmm) disp.evaluate_overlap(Theta_expansion, dThetadR_qvmm) for u, kpt in enumerate(self.kpt_u): rhoT_mm = rhoT_umm[u][m1start:m1stop, m2start:m2stop] ET_mm = ET_umm[u][m1start:m1stop, m2start:m2stop] Fkin_v = 2.0 * (dTdR_qvmm[kpt.q][:, J1start:M1stop, J2start:J2stop] * rhoT_mm[np.newaxis]).real.sum(-1).sum(-1) Ftheta_v = 2.0 * (dThetadR_qvmm[kpt.q][:, J1start:M1stop, 
J2start:J2stop] * ET_mm[np.newaxis]).real.sum(-1).sum(-1) Fkin2_av[a1] += Fkin_v Fkin2_av[a2] -= Fkin_v Ftheta2_av[a1] -= Ftheta_v Ftheta2_av[a2] += Ftheta_v Fkin_av = Fkin2_av Ftheta_av = Ftheta2_av self.timer.stop('Not so complicated loop') dHP_and_dSP_aauim = {} a2values = {} for (a2, a3) in atompairs: if not a3 in a2values: a2values[a3] = [] a2values[a3].append(a2) Fatom_av = np.zeros_like(F_av) Frho_av = np.zeros_like(F_av) self.timer.start('Complicated loop') for a1, a3 in atompairs: if a1 == a3: continue m1start = M_a[a1] - M1start if m1start >= blocksize1: continue P_expansion = P_expansions.get(a1, a3) nm1 = P_expansion.shape[0] m1stop = min(m1start + nm1, m1max) if m1stop <= 0: continue m1start = max(m1start, 0) J1start = max(0, M1start - M_a[a1]) J1stop = J1start + m1stop - m1start disp_o = get_displacements(a1, a3, phicutoff_a[a1] + pcutoff_a[a3]) if len(disp_o) == 0: continue dPdR_qvmi = P_expansion.zeros((nq, 3), dtype=dtype) for disp in disp_o: disp.evaluate_overlap(P_expansion, dPdR_qvmi) dPdR_qvmi = dPdR_qvmi[:, :, J1start:J1stop, :].copy() for a2 in a2values[a3]: m2start = M_a[a2] - M2start if m2start >= blocksize2: continue P_expansion2 = P_expansions.get(a2, a3) nm2 = P_expansion2.shape[0] m2stop = min(m2start + nm2, m2max) if m2stop <= 0: continue disp_o = get_displacements(a2, a3, phicutoff_a[a2] + pcutoff_a[a3]) if len(disp_o) == 0: continue m2start = max(m2start, 0) J2start = max(0, M2start - M_a[a2]) J2stop = J2start + m2stop - m2start if (a2, a3) in dHP_and_dSP_aauim: dHP_uim, dSP_uim = dHP_and_dSP_aauim[(a2, a3)] else: P_qmi = P_expansion2.zeros((nq,), dtype=dtype) for disp in disp_o: disp.evaluate_direct(P_expansion2, P_qmi) P_qmi = P_qmi[:, J2start:J2stop].copy() dH_sp = alldH_asp[a3] dS_ii = self.setups[a3].dO_ii dHP_uim = [] dSP_uim = [] for u, kpt in enumerate(self.kpt_u): dH_ii = unpack(dH_sp[kpt.s]) dHP_im = np.dot(P_qmi[kpt.q], dH_ii).T.conj() # XXX only need nq of these dSP_im = np.dot(P_qmi[kpt.q], dS_ii).T.conj() 
dHP_uim.append(dHP_im) dSP_uim.append(dSP_im) dHP_and_dSP_aauim[(a2, a3)] = dHP_uim, dSP_uim for u, kpt in enumerate(self.kpt_u): rhoT_mm = rhoT_umm[u][m1start:m1stop, m2start:m2stop] ET_mm = ET_umm[u][m1start:m1stop, m2start:m2stop] dPdRdHP_vmm = np.dot(dPdR_qvmi[kpt.q], dHP_uim[u]) dPdRdSP_vmm = np.dot(dPdR_qvmi[kpt.q], dSP_uim[u]) Fatom_c = 2.0 * (dPdRdHP_vmm * rhoT_mm).real.sum(-1).sum(-1) Frho_c = 2.0 * (dPdRdSP_vmm * ET_mm).real.sum(-1).sum(-1) Fatom_av[a1] += Fatom_c Fatom_av[a3] -= Fatom_c Frho_av[a1] -= Frho_c Frho_av[a3] += Frho_c self.timer.stop('Complicated loop') if not isblacs: # Potential contribution # # ----- / d Phi (r) # a \ | mu ~ # F += -2 Re ) | ---------- v (r) Phi (r) dr rho # / | d R nu nu mu # ----- / a # mu in a; nu # self.timer.start('Potential') Fpot_av = np.zeros_like(F_av) for u, kpt in enumerate(self.kpt_u): vt_G = hamiltonian.vt_sG[kpt.s] Fpot_av += bfs.calculate_force_contribution(vt_G, rhoT_uMM[u], kpt.q) self.timer.stop('Potential') # Density matrix contribution from PAW correction # # ----- ----- # a \ a \ b # F += 2 Re ) Z E - 2 Re ) Z E # / mu nu nu mu / mu nu nu mu # ----- ----- # mu nu b; mu in a; nu # # with # b* # ----- dP # b \ i mu b b # Z = ) -------- dS P # mu nu / dR ij j nu # ----- b mu # ij # self.timer.start('Paw correction') Frho_av = np.zeros_like(F_av) for u, kpt in enumerate(self.kpt_u): work_MM = np.zeros((mynao, nao), dtype) ZE_MM = None for b in my_atom_indices: setup = self.setups[b] dO_ii = np.asarray(setup.dO_ii, dtype) dOP_iM = np.zeros((setup.ni, nao), dtype) gemm(1.0, self.P_aqMi[b][kpt.q], dO_ii, 0.0, dOP_iM, 'c') for v in range(3): gemm(1.0, dOP_iM, dPdR_aqvMi[b][kpt.q][v][Mstart:Mstop], 0.0, work_MM, 'n') ZE_MM = (work_MM * ET_uMM[u]).real for a, M1, M2 in slices(): dE = 2 * ZE_MM[M1:M2].sum() Frho_av[a, v] -= dE # the "b; mu in a; nu" term Frho_av[b, v] += dE # the "mu nu" term del work_MM, ZE_MM self.timer.stop('Paw correction') # Atomic density contribution # ----- ----- # a \ a \ b # F += -2 Re 
) A rho + 2 Re ) A rho # / mu nu nu mu / mu nu nu mu # ----- ----- # mu nu b; mu in a; nu # # b* # ----- d P # b \ i mu b b # A = ) ------- dH P # mu nu / d R ij j nu # ----- b mu # ij # self.timer.start('Atomic Hamiltonian force') Fatom_av = np.zeros_like(F_av) for u, kpt in enumerate(self.kpt_u): for b in my_atom_indices: H_ii = np.asarray(unpack(hamiltonian.dH_asp[b][kpt.s]), dtype) HP_iM = gemmdot(H_ii, np.ascontiguousarray(self.P_aqMi[b][kpt.q].T.conj())) for v in range(3): dPdR_Mi = dPdR_aqvMi[b][kpt.q][v][Mstart:Mstop] ArhoT_MM = (gemmdot(dPdR_Mi, HP_iM) * rhoT_uMM[u]).real for a, M1, M2 in slices(): dE = 2 * ArhoT_MM[M1:M2].sum() Fatom_av[a, v] += dE # the "b; mu in a; nu" term Fatom_av[b, v] -= dE # the "mu nu" term self.timer.stop('Atomic Hamiltonian force') F_av += Fkin_av + Fpot_av + Ftheta_av + Frho_av + Fatom_av self.timer.start('Wait for sum') ksl.orbital_comm.sum(F_av) if self.bd.comm.rank == 0: self.kpt_comm.sum(F_av, 0) self.timer.stop('Wait for sum') self.timer.stop('LCAO forces')
# Copy an M x N sub-block of one block-cyclic matrix into another matrix
# that lives on the same BLACS grid but uses a different layout, then gather
# both matrices on the master rank for inspection.
from gpaw.mpi import world
from gpaw.blacs import BlacsGrid, Redistributor

if world.size < 2:
    raise ValueError('Runs on two or more processors')

# 2 x (size // 2) process grid.
grid = BlacsGrid(world, 2, world.size // 2)

# Source matrix: 12 x 8 with 2 x 3 blocks.  Every local element is tagged
# with the rank that owns it so the layout is visible after collection.
desc = grid.new_descriptor(12, 8, 2, 3)
a = desc.zeros()
a[:] = world.rank

# Destination matrix: 7 x 7 with 2 x 2 blocks, initially zero.
subdesc = grid.new_descriptor(7, 7, 2, 2)
b = subdesc.zeros()

r = Redistributor(grid.comm, desc, subdesc, uplo='G')

# NOTE(review): presumably (ia, ja) and (ib, jb) are the origins of the
# sub-block inside a and b, and M x N is its size -- confirm against
# Redistributor.redistribute.
ia = 3
ja = 2
ib = 1
jb = 1
M = 4
N = 5

r.redistribute(a, b, M, N, ia, ja, ib, jb)

a0 = desc.collect_on_master(a)
b0 = subdesc.collect_on_master(b)
if world.rank == 0:
    # print() with a single argument behaves the same on Python 2 and 3;
    # the former ``print a0`` statement is a syntax error on Python 3.
    print(a0)
class LrTDDFPTSolveLayout:
    """BLACS layouts for distributed TD-DFPT"""

    def __init__(self, sl_lrtddft, nrows, lr_comms):
        """Create grids, descriptors and redistributors for the solve.

        Parameters:

        sl_lrtddft
          Sequence unpacked as (mprocs, nprocs, block_size): the shape of
          the solve grid and its (square) block size.
        nrows
          The matrix is (4 * nrows) x (4 * nrows) and the right-hand side
          has 4 * nrows entries.
        lr_comms
          Object providing ``parent_comm``, ``dd_comm`` and ``eh_comm``.
        """
        self.mprocs, self.nprocs, self.block_size = tuple(sl_lrtddft)
        self.lr_comms = lr_comms
        parent_comm = self.lr_comms.parent_comm

        # for SCALAPACK we need TRANSPOSED MATRIX (and vector)
        #
        # -----------------------------------------------------------------
        # matrix

        # original grid, ie, how matrix is stored
        self.orig_matrix_grid = BlacsGrid(parent_comm,
                                          self.lr_comms.dd_comm.size,
                                          self.lr_comms.eh_comm.size)

        # solve grid
        self.solve_matrix_grid = BlacsGrid(parent_comm,
                                           self.mprocs, self.nprocs)

        # M = rows, N = cols
        M = nrows * 4
        N = nrows * 4
        mb = 4
        nb = 4
        # Descriptors take (N, M, nb, mb), i.e. columns first, because of
        # the transposed storage mentioned above.
        self.orig_matrix_descr = self.orig_matrix_grid.new_descriptor(
            N, M, nb, mb)

        bs = self.block_size
        self.solve_matrix_descr = self.solve_matrix_grid.new_descriptor(
            N, M, bs, bs)

        self.matrix_in_redist = Redistributor(parent_comm,
                                              self.orig_matrix_descr,
                                              self.solve_matrix_descr)

        # -----------------------------------------------------------------
        # vector

        # original grid, ie, how vector is stored:
        # a single row of dd_comm.size * eh_comm.size processes
        self.orig_vector_grid = BlacsGrid(
            parent_comm, 1,
            (self.lr_comms.dd_comm.size * self.lr_comms.eh_comm.size))

        # The solve-side vector reuses solve_matrix_grid; no separate
        # vector grid is created.

        # M = rows, N = cols
        M = nrows * 4
        Nrhs = 1
        mb = 4
        nb = 1
        self.orig_vector_descr = self.orig_vector_grid.new_descriptor(
            Nrhs, M, nb, mb)

        bs = self.block_size
        self.solve_vector_descr = self.solve_matrix_grid.new_descriptor(
            Nrhs, M, 1, bs)

        self.vector_in_redist = Redistributor(parent_comm,
                                              self.orig_vector_descr,
                                              self.solve_vector_descr)

        self.vector_out_redist = Redistributor(parent_comm,
                                               self.solve_vector_descr,
                                               self.orig_vector_descr)

    def solve(self, A_orig, b_orig):
        """Solve TD-DFPT equation using Scalapack.

        Parameters:

        A_orig
          Local part of the matrix in the original layout.  Ignored (an
          empty array is used instead) on ranks outside the original
          matrix grid.
        b_orig
          Local part of the right-hand side in the original layout.
          Ignored in the same way on ranks outside the original vector
          grid.

        Returns the array that received the redistributed result: b_orig
        itself on ranks inside the original vector grid, otherwise the
        empty placeholder.

        Raises RuntimeError if scalapack_solve reports a non-zero status.
        """
        A_solve = self.solve_matrix_descr.empty(dtype=float)
        if not self.orig_matrix_descr.blacsgrid.is_active():
            A_orig = np.empty((0, 0), dtype=float)

        self.matrix_in_redist.redistribute(A_orig, A_solve)

        b_solve = self.solve_vector_descr.empty(dtype=float)
        if not self.orig_vector_descr.blacsgrid.is_active():
            b_orig = np.empty((0, 0), dtype=float)

        self.vector_in_redist.redistribute(b_orig, b_solve)

        if self.solve_matrix_descr.blacsgrid.is_active():
            # Keep the status code.  It used to be discarded, so the check
            # below compared a constant 0 and could never fire.
            # NOTE(review): assumes _gpaw.scalapack_solve returns the
            # ScaLAPACK ``info`` value -- confirm against the C extension.
            # A falsy result (0 or None) is treated as success.
            info = _gpaw.scalapack_solve(A_solve,
                                         self.solve_matrix_descr.asarray(),
                                         b_solve,
                                         self.solve_vector_descr.asarray())
            if info:
                raise RuntimeError('scalapack_solve error: %d' % info)

        self.vector_out_redist.redistribute(b_solve, b_orig)

        return b_orig
def __init__(self, sl_lrtddft, nrows, lr_comms):
    """Build the BLACS grids, descriptors and redistributors for the solve.

    ``sl_lrtddft`` is unpacked as (mprocs, nprocs, block_size).  The matrix
    has 4 * nrows rows and columns and the right-hand side has 4 * nrows
    entries.  ``lr_comms`` supplies ``parent_comm``, ``dd_comm`` and
    ``eh_comm``.

    For SCALAPACK the matrix (and vector) are stored transposed, which is
    why every descriptor is created with the column count first.
    """
    self.mprocs, self.nprocs, self.block_size = tuple(sl_lrtddft)
    self.lr_comms = lr_comms

    parent = lr_comms.parent_comm
    n_dd = lr_comms.dd_comm.size
    n_eh = lr_comms.eh_comm.size
    dim = nrows * 4          # global matrix dimension
    orig_block = 4           # block size of the stored (original) layout
    solve_block = self.block_size

    # --- matrix -----------------------------------------------------------
    # Grid on which the matrix is stored, followed by the grid used to solve.
    self.orig_matrix_grid = BlacsGrid(parent, n_dd, n_eh)
    self.solve_matrix_grid = BlacsGrid(parent, self.mprocs, self.nprocs)

    self.orig_matrix_descr = self.orig_matrix_grid.new_descriptor(
        dim, dim, orig_block, orig_block)
    self.solve_matrix_descr = self.solve_matrix_grid.new_descriptor(
        dim, dim, solve_block, solve_block)

    self.matrix_in_redist = Redistributor(parent,
                                          self.orig_matrix_descr,
                                          self.solve_matrix_descr)

    # --- vector -----------------------------------------------------------
    # The vector is stored on a single row of n_dd * n_eh processes; on the
    # solve side it shares the matrix solve grid.
    n_rhs = 1
    self.orig_vector_grid = BlacsGrid(parent, 1, (n_dd * n_eh))

    self.orig_vector_descr = self.orig_vector_grid.new_descriptor(
        n_rhs, dim, 1, orig_block)
    self.solve_vector_descr = self.solve_matrix_grid.new_descriptor(
        n_rhs, dim, 1, solve_block)

    self.vector_in_redist = Redistributor(parent,
                                          self.orig_vector_descr,
                                          self.solve_vector_descr)
    self.vector_out_redist = Redistributor(parent,
                                           self.solve_vector_descr,
                                           self.orig_vector_descr)
class LrDiagonalizeLayout:
    """BLACS layout for distributed Omega matrix in linear response
    time-dependent DFT calculations"""

    def __init__(self, sl_lrtddft, nrows, lr_comms):
        """Create the storage and diagonalization layouts.

        ``sl_lrtddft`` is unpacked as (mprocs, nprocs, block_size);
        ``nrows`` is the dimension of the square matrix; ``lr_comms``
        supplies ``parent_comm``, ``dd_comm`` and ``eh_comm``.
        """
        self.mprocs, self.nprocs, self.block_size = tuple(sl_lrtddft)
        self.lr_comms = lr_comms
        parent = lr_comms.parent_comm

        # Grid on which the matrix is stored, then the grid used for the
        # diagonalization itself.
        self.matrix_grid = BlacsGrid(parent,
                                     lr_comms.dd_comm.size,
                                     lr_comms.eh_comm.size)
        self.diag_grid = BlacsGrid(parent, self.mprocs, self.nprocs)

        # For SCALAPACK the matrix is stored transposed; it is square, so
        # both global dimensions are nrows.  The stored layout uses 1 x 1
        # blocks, the diagonalization layout block_size x block_size.
        dim = nrows
        blk = self.block_size
        self.matrix_descr = self.matrix_grid.new_descriptor(dim, dim, 1, 1)
        self.diag_descr = self.diag_grid.new_descriptor(dim, dim, blk, blk)

        self.diag_in_redist = Redistributor(parent,
                                            self.matrix_descr,
                                            self.diag_descr)
        self.diag_out_redist = Redistributor(parent,
                                             self.diag_descr,
                                             self.matrix_descr)

    def diagonalize(self, eigenvectors, eigenvalues):
        """Diagonalize symmetric distributed Casida matrix using Scalapack.

        Parameters:

        eigenvectors
          distributed Casida matrix on input,
          distributed eigenvectors on output

        eigenvalues
          zero array on input, eigenvalues on output
        """
        diag_descr = self.diag_descr
        work = diag_descr.empty(dtype=float)

        # Ranks outside the storage grid contribute an empty local block.
        if not self.matrix_descr.blacsgrid.is_active():
            local = np.empty((0, 0), dtype=float)
        else:
            local = eigenvectors

        self.diag_in_redist.redistribute(local, work)

        # A copy is passed as the input matrix so that ``work`` can receive
        # the eigenvectors.
        diag_descr.diagonalize_dc(work.copy(), work, eigenvalues, 'L')

        self.diag_out_redist.redistribute(work, local)
        self.lr_comms.parent_comm.broadcast(eigenvalues, 0)
def __init__(self, sl_lrtddft, nkq, dd_comm, eh_comm):
    """Create the BLACS grids, descriptors and redistributors.

    Parameters:

    sl_lrtddft
      Sequence unpacked as (mcpus, ncpus, blocksize): shape of the
      diagonalization/solve grid and its (square) block size.
    nkq
      Dimension of the matrix to diagonalize; the linear system that is
      solved has dimension 4 * nkq.
    dd_comm
      Domain communicator.
    eh_comm
      Communicator whose ``parent`` is used as the world communicator;
      if it has no parent, dd_comm is used instead.
    """
    mcpus, ncpus, blocksize = tuple(sl_lrtddft)
    self.world = eh_comm.parent
    self.dd_comm = dd_comm
    if self.world is None:
        self.world = self.dd_comm

    # All the ranks within domain communicator contain the omega matrix
    # construct new communicator only on domain masters
    eh_ranks = np.arange(eh_comm.size) * dd_comm.size
    self.eh_comm2 = self.world.new_communicator(eh_ranks)

    # Diagonalization: stored layout on the domain masters, and the
    # block-cyclic layout used by the solver.
    self.eh_grid = BlacsGrid(self.eh_comm2, eh_comm.size, 1)
    self.eh_descr = self.eh_grid.new_descriptor(nkq, nkq, 1, nkq)
    self.diag_grid = BlacsGrid(self.world, mcpus, ncpus)
    self.diag_descr = self.diag_grid.new_descriptor(
        nkq, nkq, blocksize, blocksize)

    self.redistributor_in = Redistributor(self.world,
                                          self.eh_descr,
                                          self.diag_descr)
    self.redistributor_out = Redistributor(self.world,
                                           self.diag_descr,
                                           self.eh_descr)

    # -----------------------------------------------------------------
    # for SCALAPACK we need TRANSPOSED MATRIX (and vector)
    # -----------------------------------------------------------------
    # M = rows, N = cols
    M = nkq * 4
    N = nkq * 4
    mb = 4
    nb = 4
    Nrhs = 1
    # Matrix: dd_comm.size x eh_comm.size grid over the world communicator
    self.eh_grid2a = BlacsGrid(self.world, dd_comm.size, eh_comm.size)
    # Vector: a single row of dd_comm.size * eh_comm.size processes
    self.eh_grid2b = BlacsGrid(self.world, 1, dd_comm.size * eh_comm.size)
    self.eh_descr2a = self.eh_grid2a.new_descriptor(N, M, nb, mb)
    self.eh_descr2b = self.eh_grid2b.new_descriptor(Nrhs, N, Nrhs, nb)

    # Solve-side layouts share the diagonalization grid.
    self.solve_descr2a = self.diag_grid.new_descriptor(
        N, M, blocksize, blocksize)
    self.solve_descr2b = self.diag_grid.new_descriptor(
        Nrhs, N, Nrhs, blocksize)

    self.redist_solve_in_2a = Redistributor(self.world,
                                            self.eh_descr2a,
                                            self.solve_descr2a)
    self.redist_solve_in_2b = Redistributor(self.world,
                                            self.eh_descr2b,
                                            self.solve_descr2b)

    self.redist_solve_out_2a = Redistributor(self.world,
                                             self.solve_descr2a,
                                             self.eh_descr2a)
    self.redist_solve_out_2b = Redistributor(self.world,
                                             self.solve_descr2b,
                                             self.eh_descr2b)
class LrTDDFTLayouts:
    """BLACS layout for distributed Omega matrix in linear response
    time-dependent DFT calculations"""

    def __init__(self, sl_lrtddft, nkq, dd_comm, eh_comm):
        """Create the BLACS grids, descriptors and redistributors.

        Parameters:

        sl_lrtddft
          Sequence unpacked as (mcpus, ncpus, blocksize): shape of the
          diagonalization/solve grid and its (square) block size.
        nkq
          Dimension of the matrix to diagonalize; the linear system that
          is solved has dimension 4 * nkq.
        dd_comm
          Domain communicator.
        eh_comm
          Communicator whose ``parent`` is used as the world communicator;
          if it has no parent, dd_comm is used instead.
        """
        mcpus, ncpus, blocksize = tuple(sl_lrtddft)
        self.world = eh_comm.parent
        self.dd_comm = dd_comm
        if self.world is None:
            self.world = self.dd_comm

        # All the ranks within domain communicator contain the omega matrix
        # construct new communicator only on domain masters
        eh_ranks = np.arange(eh_comm.size) * dd_comm.size
        self.eh_comm2 = self.world.new_communicator(eh_ranks)

        self.eh_grid = BlacsGrid(self.eh_comm2, eh_comm.size, 1)
        self.eh_descr = self.eh_grid.new_descriptor(nkq, nkq, 1, nkq)
        self.diag_grid = BlacsGrid(self.world, mcpus, ncpus)
        self.diag_descr = self.diag_grid.new_descriptor(
            nkq, nkq, blocksize, blocksize)

        self.redistributor_in = Redistributor(self.world,
                                              self.eh_descr,
                                              self.diag_descr)
        self.redistributor_out = Redistributor(self.world,
                                               self.diag_descr,
                                               self.eh_descr)

        # -----------------------------------------------------------------
        # for SCALAPACK we need TRANSPOSED MATRIX (and vector)
        # -----------------------------------------------------------------
        # M = rows, N = cols
        M = nkq * 4
        N = nkq * 4
        mb = 4
        nb = 4
        Nrhs = 1
        # Matrix: dd_comm.size x eh_comm.size grid over the world
        self.eh_grid2a = BlacsGrid(self.world, dd_comm.size, eh_comm.size)
        # Vector: a single row of dd_comm.size * eh_comm.size processes
        self.eh_grid2b = BlacsGrid(self.world, 1,
                                   dd_comm.size * eh_comm.size)
        self.eh_descr2a = self.eh_grid2a.new_descriptor(N, M, nb, mb)
        self.eh_descr2b = self.eh_grid2b.new_descriptor(Nrhs, N, Nrhs, nb)

        # Solve-side layouts share the diagonalization grid.
        self.solve_descr2a = self.diag_grid.new_descriptor(
            N, M, blocksize, blocksize)
        self.solve_descr2b = self.diag_grid.new_descriptor(
            Nrhs, N, Nrhs, blocksize)

        self.redist_solve_in_2a = Redistributor(self.world,
                                                self.eh_descr2a,
                                                self.solve_descr2a)
        self.redist_solve_in_2b = Redistributor(self.world,
                                                self.eh_descr2b,
                                                self.solve_descr2b)

        self.redist_solve_out_2a = Redistributor(self.world,
                                                 self.solve_descr2a,
                                                 self.eh_descr2a)
        self.redist_solve_out_2b = Redistributor(self.world,
                                                 self.solve_descr2b,
                                                 self.eh_descr2b)

    def solve(self, A, b):
        """Solve the distributed linear system with ScaLAPACK, in place.

        ``A`` is the local part of the matrix and ``b`` the local part of
        the right-hand side, both in the stored layout.  The result is
        written back into ``b``; nothing is returned.

        Raises RuntimeError if scalapack_solve reports a non-zero status.
        """
        A_nn = self.solve_descr2a.empty(dtype=float)
        if self.eh_descr2a.blacsgrid.is_active():
            A_Nn = A
        else:
            A_Nn = np.empty((0, 0), dtype=float)
        self.redist_solve_in_2a.redistribute(A_Nn, A_nn)

        b_n = self.solve_descr2b.empty(dtype=float)
        if self.eh_descr2b.blacsgrid.is_active():
            # The vector descriptor describes a 1 x N array.
            b_N = b.reshape(1, len(b))
        else:
            b_N = np.empty((A_Nn.shape[0], 0), dtype=float)
        self.redist_solve_in_2b.redistribute(b_N, b_n)

        if self.solve_descr2a.blacsgrid.is_active():
            # Keep the status code.  It used to be discarded, so the check
            # below compared a constant 0 and could never fire.
            # NOTE(review): assumes _gpaw.scalapack_solve returns the
            # ScaLAPACK ``info`` value -- confirm against the C extension.
            # A falsy result (0 or None) is treated as success.
            info = _gpaw.scalapack_solve(A_nn,
                                         self.solve_descr2a.asarray(),
                                         b_n,
                                         self.solve_descr2b.asarray())
            if info:
                raise RuntimeError('scalapack_solve error: %d' % info)

        self.redist_solve_out_2b.redistribute(b_n, b_N)

        if self.eh_descr2b.blacsgrid.is_active():
            b_N = b_N.flatten()
        else:
            b_N = b

        b[:] = b_N

    def diagonalize(self, Om, eps_n):
        """Diagonalize the distributed matrix ``Om`` with ScaLAPACK.

        On output ``Om`` holds the redistributed eigenvectors and
        ``eps_n`` the eigenvalues, broadcast from rank 0 of the world
        communicator.
        """
        O_nn = self.diag_descr.empty(dtype=float)
        if self.eh_descr.blacsgrid.is_active():
            O_nN = Om
        else:
            O_nN = np.empty((0, 0), dtype=float)

        self.redistributor_in.redistribute(O_nN, O_nn)
        # A copy is passed as input so that O_nn can receive the output.
        self.diag_descr.diagonalize_dc(O_nn.copy(), O_nn, eps_n, 'L')
        self.redistributor_out.redistribute(O_nn, O_nN)
        self.world.broadcast(eps_n, 0)
        # Broadcast eigenvectors within domains
        if not self.eh_descr.blacsgrid.is_active():
            O_nN = Om
        self.dd_comm.broadcast(O_nN, 0)
def diagonalize_full_hamiltonian(self, ham, atoms, occupations, txt,
                                 nbands=None, scalapack=None, expert=False):
    """Diagonalize the full Hamiltonian for every k-point in self.kpt_u.

    For each k-point H and S are built with self.hs(), the generalized
    eigenproblem is solved, and the local slice of eigenvalues and
    wave functions is stored on the k-point together with its projections.

    Parameters:

    ham
      Hamiltonian object, forwarded to self.hs().
    atoms
      Provides get_scaled_positions() for positioning the projectors.
    occupations
      Its calculate(self) is called once all k-points are done.
    txt
      File-like object that receives the log output and progress bar.
    nbands
      Number of bands to keep.  Defaults to the largest multiple of the
      band-communicator size not exceeding self.pd.ngmin.
    scalapack
      Optional list/tuple (nprow, npcol, blocksize).  Only consulted when
      the band communicator has more than one rank; otherwise a grid as
      square as possible and block size 64 are chosen.
    expert
      If true, ``iu=nbands`` is passed to the serial solver.
    """
    assert self.dtype == complex

    if nbands is None:
        # Round ngmin down to a multiple of the band-communicator size.
        nbands = self.pd.ngmin // self.bd.comm.size * self.bd.comm.size
    else:
        assert nbands <= self.pd.ngmin

    if expert:
        iu = nbands
    else:
        iu = None

    # Replace the band descriptor by one for the requested band count.
    self.bd = bd = BandDescriptor(nbands, self.bd.comm)

    p = functools.partial(print, file=txt)
    p('Diagonalizing full Hamiltonian ({0} lowest bands)'.format(nbands))
    p('Matrix size (min, max): {0}, {1}'.format(self.pd.ngmin,
                                                self.pd.ngmax))
    # Three ngmax x ngmax matrices at 16 bytes per element, shared between
    # the ranks of the band communicator, in MB.
    mem = 3 * self.pd.ngmax**2 * 16 / bd.comm.size / 1024**2
    p('Approximate memory usage per core: {0:.3f} MB'.format(mem))
    if bd.comm.size > 1:
        if isinstance(scalapack, (list, tuple)):
            nprow, npcol, b = scalapack
        else:
            # Largest divisor of the communicator size not exceeding its
            # (rounded) square root.
            nprow = int(round(bd.comm.size**0.5))
            while bd.comm.size % nprow != 0:
                nprow -= 1
            npcol = bd.comm.size // nprow
            b = 64
        p('ScaLapack grid: {0}x{1},'.format(nprow, npcol),
          'block-size:', b)
        # bg: one-dimensional grid (size x 1); bg2: the nprow x npcol grid
        # used for the diagonalization.
        bg = BlacsGrid(bd.comm, bd.comm.size, 1)
        bg2 = BlacsGrid(bd.comm, nprow, npcol)
        # From here on ``scalapack`` is just a boolean flag.
        scalapack = True
    else:
        nprow = npcol = 1
        scalapack = False

    self.pt.set_positions(atoms.get_scaled_positions())
    self.kpt_u[0].P_ani = None
    self.allocate_arrays_for_projections(self.pt.my_atom_indices)

    myslice = bd.get_slice()

    pb = ProgressBar(txt)
    nkpt = len(self.kpt_u)

    for u, kpt in enumerate(self.kpt_u):
        pb.update(u / nkpt)
        npw = len(self.pd.Q_qG[kpt.q])
        if scalapack:
            # Ceiling division: rows per rank in the one-dimensional layout.
            mynpw = -(-npw // bd.comm.size)
            md = BlacsDescriptor(bg, npw, npw, mynpw, npw)
            md2 = BlacsDescriptor(bg2, npw, npw, b, b)
        else:
            md = md2 = MatrixDescriptor(npw, npw)

        with self.timer('Build H and S'):
            H_GG, S_GG = self.hs(ham, kpt.q, kpt.s, md)

        if scalapack:
            # Move H and S from the build layout to the block-cyclic one.
            r = Redistributor(bd.comm, md, md2)
            H_GG = r.redistribute(H_GG)
            S_GG = r.redistribute(S_GG)

        psit_nG = md2.empty(dtype=complex)
        eps_n = np.empty(npw)

        with self.timer('Diagonalize'):
            if not scalapack:
                md2.general_diagonalize_dc(H_GG, S_GG, psit_nG, eps_n,
                                           iu=iu)
            else:
                md2.general_diagonalize_dc(H_GG, S_GG, psit_nG, eps_n)
        # Drop the references to the large matrices before the eigenvectors
        # are redistributed.
        del H_GG, S_GG

        kpt.eps_n = eps_n[myslice].copy()

        if scalapack:
            # Back to a one-dimensional layout with bd.mynbands rows per
            # rank.
            md3 = BlacsDescriptor(bg, npw, npw, bd.mynbands, npw)
            r = Redistributor(bd.comm, md2, md3)
            psit_nG = r.redistribute(psit_nG)

        kpt.psit_nG = psit_nG[:bd.mynbands].copy()
        del psit_nG

        with self.timer('Projections'):
            self.pt.integrate(kpt.psit_nG, kpt.P_ani, kpt.q)

        kpt.f_n = None

    pb.finish()

    occupations.calculate(self)
def get_vchi(self, w_w=None, eta=0.1, q_c=[0.0, 0.0, 0.0],
             direction=0, ac=1.0, readfile=None, optical=True,
             write_eig=None):
    r"""Returns v * \chi where v is the bare Coulomb interaction

    Parameters:

    w_w
      Array of frequencies; required despite the ``None`` default.  It is
      divided by Hartree before use (presumably given in eV).
    eta
      Broadening, also divided by Hartree before use.
    q_c
      Passed to get_bse_matrix() and used to build the q-vector that
      normalizes the result when self.q_c is non-zero.  The default list
      is not modified by this method.
    direction, ac, readfile, optical
      Forwarded to get_bse_matrix().  ``ac`` also scales the returned
      array.
    write_eig
      Optional file name; if given, rank 0 writes one line per eigenvalue
      with its index, real part in eV and the real part of its weight.

    Returns the complex array ``vchi_w * ac`` with one value per frequency.

    Raises TypeError if ``w_w`` is None.
    """
    if w_w is None:
        # Fail before the expensive BSE matrix construction rather than at
        # the len(w_w) call that follows it.
        raise TypeError('get_vchi() requires an array of frequencies w_w')

    self.get_bse_matrix(q_c=q_c, direction=direction, ac=ac,
                        readfile=readfile, optical=optical,
                        write_eig=write_eig)

    w_T = self.w_T
    rhoG0_S = self.rhoG0_S
    df_S = self.df_S

    print('Calculating response function at %s frequency points' %
          len(w_w), file=self.fd)
    vchi_w = np.zeros(len(w_w), dtype=complex)

    if not self.td:
        # Eigenvectors from the non-td diagonalization are not assumed
        # orthogonal, so the inverse of their overlap matrix is applied.
        # Everything is computed on rank 0 and then broadcast.
        C_T = np.zeros(self.nS - len(self.excludef_S), complex)
        if world.rank == 0:
            A_T = np.dot(rhoG0_S, self.v_ST)
            B_T = np.dot(rhoG0_S * df_S, self.v_ST)
            tmp = np.dot(self.v_ST.conj().T, self.v_ST)
            overlap_tt = np.linalg.inv(tmp)
            C_T = np.dot(B_T.conj(), overlap_tt.T) * A_T
        world.broadcast(C_T, 0)
    else:
        A_t = np.dot(rhoG0_S, self.v_St)
        B_t = np.dot(rhoG0_S * df_S, self.v_St)
        if world.size == 1:
            C_T = B_t.conj() * A_t
        else:
            # Each rank holds a slice of the weights: gather them on the
            # master through a one-column BLACS array, then broadcast.
            Nv = self.nv * (self.spinors + 1)
            Nc = self.nc * (self.spinors + 1)
            Ns = self.spins
            nS = self.nS
            # Ceiling division of the k-points over the ranks.
            ns = -(-self.kd.nbzkpts // world.size) * Nv * Nc * Ns
            grid = BlacsGrid(world, world.size, 1)
            desc = grid.new_descriptor(nS, 1, ns, 1)
            C_t = desc.empty(dtype=complex)
            C_t[:, 0] = B_t.conj() * A_t
            C_T = desc.collect_on_master(C_t)[:, 0]
            if world.rank != 0:
                C_T = np.empty(nS, dtype=complex)
            world.broadcast(C_T, 0)

    eta /= Hartree
    for iw, w in enumerate(w_w / Hartree):
        tmp_T = 1. / (w - w_T + 1j * eta)
        vchi_w[iw] += np.dot(tmp_T, C_T)
    vchi_w *= 4 * np.pi / self.vol

    if not np.allclose(self.q_c, 0.0):
        cell_cv = self.calc.wfs.gd.cell_cv
        B_cv = 2 * np.pi * np.linalg.inv(cell_cv).T
        q_v = np.dot(q_c, B_cv)
        vchi_w /= np.dot(q_v, q_v)

    # Check f-sum rule (trapezoidal integration of w * Im(vchi)).
    nv = self.calc.wfs.setups.nvalence
    dw_w = (w_w[1:] - w_w[:-1]) / Hartree
    wchi_w = (w_w[1:] * vchi_w[1:] + w_w[:-1] * vchi_w[:-1]) / Hartree / 2
    N = -np.dot(dw_w, wchi_w.imag) * self.vol / (2 * np.pi**2)
    print(file=self.fd)
    print('Checking f-sum rule:', file=self.fd)
    print(' Valence = %s, N = %f' % (nv, N), file=self.fd)
    print(file=self.fd)

    if write_eig is not None and world.rank == 0:
        # The context manager closes the file even if a write fails.
        with open(write_eig, 'w') as f:
            print('# %s eigenvalues in eV' % self.mode, file=f)
            for iw, w in enumerate(self.w_T * Hartree):
                print('%8d %12.6f %12.16f' % (iw, w.real, C_T[iw].real),
                      file=f)

    return vchi_w * ac
def main(N=72, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Compare ScaLAPACK drivers against serial reference results.

    Random N x N matrices H0 and S0 are created, reference results are
    computed serially on rank 0, and the same operations are repeated on
    an mprocs x nprocs BLACS grid with 8 x 8 blocks.  The maximum absolute
    deviations are printed on rank 0 and asserted to be below the
    module-level ``tol`` on every rank.

    Parameters:

    N
      Matrix dimension.
    seed
      Seed for the random number generator.
    mprocs, nprocs
      Shape of the BLACS process grid.
    dtype
      float or complex.  The matrix inverse is only run and checked for
      complex.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if dtype == complex:
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    glob = grid.new_descriptor(N, N, N, N)

    # Populate matrices local to master:
    H0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    S0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    C0 = glob.empty(dtype=dtype)
    if rank == 0:
        # Complex case must have real numbers on the diagonal.
        # We make a simple complex Hermitian matrix below.
        H0 = H0 + epsilon * (0.1 * np.tri(N, N, k=-N // nprocs) +
                             0.3 * np.tri(N, N, k=-1))
        S0 = S0 + epsilon * (0.2 * np.tri(N, N, k=-N // nprocs) +
                             0.4 * np.tri(N, N, k=-1))
        # Make matrices symmetric
        rk(1.0, H0.copy(), 0.0, H0)
        rk(1.0, S0.copy(), 0.0, S0)
        # Overlap matrix must be semi-positive definite
        S0 = S0 + 50.0 * np.eye(N, N, 0)
        # Hamiltonian is usually diagonally dominant
        H0 = H0 + 75.0 * np.eye(N, N, 0)
        C0 = S0.copy()
        S0_inv = S0.copy()

    # Local result matrices
    W0 = np.empty((N), dtype=float)
    W0_g = np.empty((N), dtype=float)

    # Calculate eigenvalues / other serial results
    if rank == 0:
        diagonalize(H0.copy(), W0)
        general_diagonalize(H0.copy(), W0_g, S0.copy())
        inverse_cholesky(C0)  # result returned in lower triangle
        tri2full(S0_inv, 'L')
        S0_inv = inv(S0_inv)

    assert glob.check(H0) and glob.check(S0) and glob.check(C0)

    # Create distributed descriptors with various block sizes:
    dist = grid.new_descriptor(N, N, 8, 8)

    # Distributed matrices:
    # We can use empty here, but end up with garbage on
    # on the other half of the triangle when we redistribute.
    # This is fine because ScaLAPACK does not care.
    H = dist.empty(dtype=dtype)
    S = dist.empty(dtype=dtype)
    Z = dist.empty(dtype=dtype)
    C = dist.empty(dtype=dtype)
    Sinv = dist.empty(dtype=dtype)

    # Eigenvalues are non-BLACS matrices.  Only the divide-and-conquer
    # drivers are exercised, so only their buffers are allocated.
    W_dc = np.empty((N), dtype=float)
    W_g_dc = np.empty((N), dtype=float)

    Glob2dist = Redistributor(world, glob, dist)
    Glob2dist.redistribute(H0, H, uplo='L')
    Glob2dist.redistribute(S0, S, uplo='L')
    Glob2dist.redistribute(S0, C, uplo='L')  # C0 was previously overwritten
    Glob2dist.redistribute(S0, Sinv, uplo='L')

    # we don't test the expert drivers anymore since there
    # might be a buffer overflow error
    scalapack_diagonalize_dc(dist, H.copy(), Z, W_dc, 'L')
    scalapack_general_diagonalize_dc(dist, H.copy(), S.copy(), Z, W_g_dc, 'L')

    scalapack_inverse_cholesky(dist, C, 'L')

    if dtype == complex:  # Only supported for complex for now
        scalapack_inverse(dist, Sinv, 'L')

    # Undo redistribute
    C_test = glob.empty(dtype=dtype)
    Sinv_test = glob.empty(dtype=dtype)
    Dist2glob = Redistributor(world, dist, glob)
    Dist2glob.redistribute(C, C_test)
    Dist2glob.redistribute(Sinv, Sinv_test)

    if rank == 0:
        diag_dc_err = abs(W_dc - W0).max()
        general_diag_dc_err = abs(W_g_dc - W0_g).max()
        inverse_chol_err = abs(C_test - C0).max()

        tri2full(Sinv_test, 'L')
        inverse_err = abs(Sinv_test - S0_inv).max()

        print('diagonalize dc err', diag_dc_err)
        print('general diagonalize dc err', general_diag_dc_err)
        print('inverse chol err', inverse_chol_err)
        if dtype == complex:
            print('inverse err', inverse_err)
    else:
        diag_dc_err = 0.0
        general_diag_dc_err = 0.0
        inverse_chol_err = 0.0
        inverse_err = 0.0

    # We don't like exceptions on only one cpu
    diag_dc_err = world.sum(diag_dc_err)
    general_diag_dc_err = world.sum(general_diag_dc_err)
    inverse_chol_err = world.sum(inverse_chol_err)
    inverse_err = world.sum(inverse_err)

    assert diag_dc_err < tol
    assert general_diag_dc_err < tol
    assert inverse_chol_err < tol
    if dtype == complex:
        assert inverse_err < tol
# in trunk/gpaw/blacs.py for some discussions of
# these idiosyncrasies.
import numpy as np

from gpaw.blacs import BlacsGrid, parallelprint
from gpaw.mpi import world, rank, size
from gpaw.utilities.scalapack import pblas_simple_gemm

gen = np.random.RandomState(42)

# simulate state-parallelization=2 and
# domain-decomposition.prod=32
B = 2
D = 32
mb = 32
grid = BlacsGrid(world, B, D)

nbands = 500
nG = 80 ** 3

# Block sizes must be integers: use floor division so they stay ints on
# Python 3, where ``/`` would produce 250.0 and 16000.0.
nGdesc = grid.new_descriptor(nbands, nG, nbands // B, nG // D)
nndesc = grid.new_descriptor(nbands, nbands, mb, mb)

# Local arrays filled with random numbers, shaped by each descriptor.
psit_nG = gen.rand(*nGdesc.shape)
A_nn = gen.rand(*nndesc.shape)

assert nGdesc.check(psit_nG)
assert nndesc.check(A_nn)

parallelprint(world, (A_nn.shape, nndesc.shape, nndesc.lld))
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Compare the ``pblas_simple_*`` wrappers with serial reference results.

    Random input matrices are drawn from a seeded ``RandomState``, the
    reference results are evaluated on rank 0, the same operations are run
    on block-cyclic distributed copies, and the gathered results are
    compared with ``equal`` against the module-level ``tol``.

    Parameters:

    M, N, K: int
        Global dimensions: A and D are M x K, B is K x N, C is M x N and
        the hermitian test matrix is K x K.
    seed: int
        Seed passed to ``np.random.RandomState``.
    mprocs, nprocs: int
        Shape of the BLACS process grid created on ``world``.
    dtype: type
        ``float`` or ``complex``; for ``complex`` a random imaginary part
        is added to every input matrix.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if dtype == complex:
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)
    globHEC = grid.new_descriptor(K, K, K, K)

    # Populate matrices local to master:
    A0 = gen.rand(*globA.shape) + epsilon * gen.rand(*globA.shape)
    B0 = gen.rand(*globB.shape) + epsilon * gen.rand(*globB.shape)
    D0 = gen.rand(*globD.shape) + epsilon * gen.rand(*globD.shape)
    X0 = gen.rand(*globX.shape) + epsilon * gen.rand(*globX.shape)

    # HEC = HEA * B
    HEA0 = gen.rand(*globHEC.shape) + epsilon * gen.rand(*globHEC.shape)
    if world.rank == 0:
        HEA0 = HEA0 + HEA0.T.conjugate()  # Make H0 hermitean
        HEA0 = np.ascontiguousarray(HEA0)

    # Local result matrices
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U0 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC0 = globB.zeros(dtype=dtype)

    # Local reference matrix product:
    if rank == 0:
        # C0[:] = np.dot(A0, B0)
        gemm(1.0, B0, A0, 0.0, C0)
        # gemm(1.0, A0, A0, 0.0, Z0, transa='t')
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Y0[:] = np.dot(A0, X0)
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)

        HEC0[:] = np.dot(HEA0, B0)
        sM, sN = HEA0.shape
        # We don't use upper diagonal: overwrite every element with i < j
        # by a sentinel so that reading it would show up in the result.
        HEA0[np.triu_indices(sM, 1, sN)] = 99999.0
        if world.rank == 0:
            print(HEA0)

    assert globA.check(A0) and globB.check(B0) and globC.check(C0)
    assert globX.check(X0) and globY.check(Y0)
    assert globD.check(D0) and globS.check(S0) and globU.check(U0)

    # Create distributed destriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)
    distHE = grid.new_descriptor(K, K, 2, 4)

    # Distributed matrices:
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U = distU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC = distB.zeros(dtype=dtype)
    HEA = distHE.zeros(dtype=dtype)

    Redistributor(world, globA, distA).redistribute(A0, A)
    Redistributor(world, globB, distB).redistribute(B0, B)
    Redistributor(world, globX, distX).redistribute(X0, X)
    Redistributor(world, globD, distD).redistribute(D0, D)
    Redistributor(world, globHEC, distHE).redistribute(HEA0, HEA)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    # NOTE(review): Z (and its reference Z0) is computed but never gathered
    # or compared below, so the transposed gemm is exercised, not verified.
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa='T')
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)
    pblas_simple_hemm(distHE, distB, distB, HEA, B, HEC, uplo='L', side='L')

    # Collect result back on master
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U1 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC1 = globB.zeros(dtype=dtype)
    Redistributor(world, distC, globC).redistribute(C, C1)
    Redistributor(world, distY, globY).redistribute(Y, Y1)
    Redistributor(world, distS, globS).redistribute(S, S1)
    Redistributor(world, distU, globU).redistribute(U, U1)
    Redistributor(world, distB, globB).redistribute(HEC, HEC1)

    if rank == 0:
        gemm_err = abs(C1 - C0).max()
        gemv_err = abs(Y1 - Y0).max()
        r2k_err = abs(S1 - S0).max()
        rk_err = abs(U1 - U0).max()
        hemm_err = abs(HEC1 - HEC0).max()
        print('gemm err', gemm_err)
        print('gemv err', gemv_err)
        print('r2k err', r2k_err)
        print('rk_err', rk_err)
        print('hemm_err', hemm_err)
    else:
        gemm_err = 0.0
        gemv_err = 0.0
        r2k_err = 0.0
        rk_err = 0.0
        hemm_err = 0.0

    # We don't like exceptions on only one cpu: summing over world gives
    # every rank the same error values before they are checked.
    gemm_err = world.sum(gemm_err)
    gemv_err = world.sum(gemv_err)
    r2k_err = world.sum(r2k_err)
    rk_err = world.sum(rk_err)
    hemm_err = world.sum(hemm_err)

    equal(gemm_err, 0, tol)
    equal(gemv_err, 0, tol)
    equal(r2k_err, 0, tol)
    equal(rk_err, 0, tol)
    equal(hemm_err, 0, tol)
def calculate_rkernel(self):
    """Build the kernel on the real-space grid and write it in PW basis.

    For every lattice vector R and every grid point r' owned by this rank,
    the kernel f_rr + V_rr is evaluated as a function of r, Fourier
    transformed with respect to r for each q-point, and accumulated in
    ``fhxc_qsGr``.  The result is then redistributed over plane waves
    (when running on more than one rank), Fourier transformed with respect
    to r', summed over ``mpi.world`` and written by rank 0 to one
    ``fhxc_<tag>_<xc>_<ecut>_<iq>.gpw`` file per q-point.

    Nothing is returned; progress messages go to ``self.fd``.
    """
    gd = self.gd
    ng_c = gd.N_c
    cell_cv = gd.cell_cv
    icell_cv = 2 * np.pi * np.linalg.inv(cell_cv)
    vol = np.linalg.det(cell_cv)

    ns = self.calc.wfs.nspins
    n_g = self.n_g  # density on rough grid

    fx_g = ns * self.get_fxc_g(n_g)  # local exchange kernel
    qc_g = (-4 * np.pi * ns / fx_g)**0.5  # cutoff functional
    flocal_g = qc_g**3 * fx_g / (6 * np.pi**2)  # ren. x-kernel for r=r'
    Vlocal_g = 2 * qc_g / np.pi  # ren. Hartree kernel for r=r'

    ng = np.prod(ng_c)  # number of grid points
    r_vg = gd.get_grid_point_coordinates()
    rx_g = r_vg[0].flatten()
    ry_g = r_vg[1].flatten()
    rz_g = r_vg[2].flatten()

    prnt(' %d grid points and %d plane waves at the Gamma point' %
         (ng, self.pd.ngmax), file=self.fd)

    # Unit cells
    R_Rv = []
    weight_R = []
    nR_v = self.unit_cells
    nR = np.prod(nR_v)
    for i in range(-nR_v[0] + 1, nR_v[0]):
        for j in range(-nR_v[1] + 1, nR_v[1]):
            for h in range(-nR_v[2] + 1, nR_v[2]):
                R_Rv.append(i * cell_cv[0] +
                            j * cell_cv[1] +
                            h * cell_cv[2])
                weight_R.append((nR_v[0] - abs(i)) *
                                (nR_v[1] - abs(j)) *
                                (nR_v[2] - abs(h)) / float(nR))
    if nR > 1:
        # with more than one unit cell only the exchange kernel is
        # calculated on the grid. The bare Coulomb kernel is added
        # in PW basis and Vlocal_g only the exchange part
        dv = self.calc.density.gd.dv
        gc = (3 * dv / 4 / np.pi)**(1 / 3.)
        Vlocal_g -= 2 * np.pi * gc**2 / dv
    prnt(' Lattice point sampling: ' +
         '(%s x %s x %s)^2 ' % (nR_v[0], nR_v[1], nR_v[2]) +
         ' Reduced to %s lattice points' % len(R_Rv), file=self.fd)

    # Ceiling division: each rank owns a contiguous slice of grid points.
    l_g_size = -(-ng // mpi.world.size)
    l_g_range = range(mpi.world.rank * l_g_size,
                      min((mpi.world.rank + 1) * l_g_size, ng))

    fhxc_qsGr = {}
    for iq in range(len(self.ibzq_qc)):
        fhxc_qsGr[iq] = np.zeros((ns, len(self.pd.G2_qG[iq]),
                                  len(l_g_range)), dtype=complex)

    t0 = time()
    # Divisions by zero / invalid values are silenced while the kernel is
    # evaluated; errstate restores the previous settings even if an
    # exception propagates out of the loop.
    with np.errstate(divide='ignore', invalid='ignore'):
        # Loop over Lattice points
        for i, R_v in enumerate(R_Rv):
            # Loop over r'. f_rr and V_rr are functions of r
            # (dim. as r_vg[0])
            if i == 1:
                prnt(' Finished 1 cell in %s seconds' % int(time() - t0) +
                     ' - estimated %s seconds left' %
                     int((len(R_Rv) - 1) * (time() - t0)),
                     file=self.fd)
                self.fd.flush()
            if len(R_Rv) > 5:
                # Integer step so that progress is reported roughly five
                # times; with true division the modulus would be a float
                # and the test would almost never succeed.
                if (i + 1) % (len(R_Rv) // 5 + 1) == 0:
                    prnt(' Finished %s cells in %s seconds'
                         % (i, int(time() - t0)) +
                         ' - estimated %s seconds left' %
                         int((len(R_Rv) - i) * (time() - t0) / i),
                         file=self.fd)
                    self.fd.flush()
            for g in l_g_range:
                rx = rx_g[g] + R_v[0]
                ry = ry_g[g] + R_v[1]
                rz = rz_g[g] + R_v[2]

                # |r-r'-R_i|
                rr = ((r_vg[0] - rx)**2 +
                      (r_vg[1] - ry)**2 +
                      (r_vg[2] - rz)**2)**0.5

                n_av = (n_g + n_g.flatten()[g]) / 2.
                fx_g = ns * self.get_fxc_g(n_av, index=g)
                qc_g = (-4 * np.pi * ns / fx_g)**0.5
                x = qc_g * rr
                osc_x = np.sin(x) - x * np.cos(x)
                f_rr = fx_g * osc_x / (2 * np.pi**2 * rr**3)
                if nR > 1:
                    # include only exchange part of the kernel here
                    V_rr = (sici(x)[0] * 2 / np.pi - 1) / rr
                else:
                    # include the full kernel (also hartree part)
                    V_rr = (sici(x)[0] * 2 / np.pi) / rr

                # Terms with r = r'
                if (np.abs(R_v) < 0.001).all():
                    tmp_flat = f_rr.flatten()
                    tmp_flat[g] = flocal_g.flatten()[g]
                    f_rr = tmp_flat.reshape(ng_c)
                    tmp_flat = V_rr.flatten()
                    tmp_flat[g] = Vlocal_g.flatten()[g]
                    V_rr = tmp_flat.reshape(ng_c)
                    del tmp_flat

                # Zero the kernel where the averaged density is below the
                # cutoff; the index set is the same for both arrays.
                low_density = np.where(n_av < self.density_cut)
                f_rr[low_density] = 0.0
                V_rr[low_density] = 0.0

                f_rr *= weight_R[i]
                V_rr *= weight_R[i]

                # r-r'-R_i
                r_r = np.array([r_vg[0] - rx, r_vg[1] - ry, r_vg[2] - rz])

                # Fourier transform of r
                for iq, q in enumerate(self.ibzq_qc):
                    q_v = np.dot(q, icell_cv)
                    e_q = np.exp(-1j * gemmdot(q_v, r_r, beta=0.0))
                    f_q = self.pd.fft((f_rr + V_rr) * e_q, iq) * vol / ng
                    fhxc_qsGr[iq][0, :, g - l_g_range[0]] += f_q
                    if ns == 2:
                        f_q = self.pd.fft(V_rr * e_q, iq) * vol / ng
                        fhxc_qsGr[iq][1, :, g - l_g_range[0]] += f_q

    mpi.world.barrier()

    for iq, q in enumerate(self.ibzq_qc):
        npw = len(self.pd.G2_qG[iq])
        fhxc_sGsG = np.zeros((ns * npw, ns * npw), complex)
        l_pw_size = -(-npw // mpi.world.size)  # parallelize over PW below
        l_pw_range = range(mpi.world.rank * l_pw_size,
                           min((mpi.world.rank + 1) * l_pw_size, npw))

        if mpi.world.size > 1:
            # redistribute grid and plane waves in fhxc_qsGr[iq]
            bg1 = BlacsGrid(mpi.world, 1, mpi.world.size)
            bg2 = BlacsGrid(mpi.world, mpi.world.size, 1)
            # Block sizes use the same integer ceiling division as
            # l_g_size / l_pw_size above, so the descriptors agree with
            # the slices each rank actually holds.
            bd1 = bg1.new_descriptor(npw, ng, npw,
                                     -(-ng // mpi.world.size))
            bd2 = bg2.new_descriptor(npw, ng,
                                     -(-npw // mpi.world.size), ng)

            fhxc_Glr = np.zeros((len(l_pw_range), ng), dtype=complex)
            if ns == 2:
                Koff_Glr = np.zeros((len(l_pw_range), ng), dtype=complex)

            r = Redistributor(bg1.comm, bd1, bd2)
            r.redistribute(fhxc_qsGr[iq][0], fhxc_Glr, npw, ng)
            if ns == 2:
                r.redistribute(fhxc_qsGr[iq][1], Koff_Glr, npw, ng)
        else:
            fhxc_Glr = fhxc_qsGr[iq][0]
            if ns == 2:
                Koff_Glr = fhxc_qsGr[iq][1]

        # Fourier transform of r'
        for iG in range(len(l_pw_range)):
            f_g = fhxc_Glr[iG].reshape(ng_c)
            f_G = self.pd.fft(f_g.conj(), iq) * vol / ng
            fhxc_sGsG[l_pw_range[0] + iG, :npw] = f_G.conj()
            if ns == 2:
                v_g = Koff_Glr[iG].reshape(ng_c)
                v_G = self.pd.fft(v_g.conj(), iq) * vol / ng
                fhxc_sGsG[npw + l_pw_range[0] + iG, :npw] = v_G.conj()

        if ns == 2:  # f_00 = f_11 and f_01 = f_10
            fhxc_sGsG[:npw, npw:] = fhxc_sGsG[npw:, :npw]
            fhxc_sGsG[npw:, npw:] = fhxc_sGsG[:npw, :npw]

        mpi.world.sum(fhxc_sGsG)
        fhxc_sGsG /= vol

        if mpi.rank == 0:
            w = Writer('fhxc_%s_%s_%s_%s.gpw' %
                       (self.tag, self.xc, self.ecut, iq))
            w.dimension('sG', ns * npw)
            w.add('fhxc_sGsG', ('sG', 'sG'), dtype=complex)
            if nR > 1:  # add Hartree kernel evaluated in PW basis
                # NOTE(review): Gq2_G is not copied, so the assignment
                # below appears to write through to self.pd.G2_qG[iq]
                # (on rank 0 only) - confirm this is intended.
                Gq2_G = self.pd.G2_qG[iq]
                if (q == 0).all():
                    Gq2_G[0] = 1.
                vq_G = 4 * np.pi / Gq2_G
                fhxc_sGsG += np.tile(np.eye(npw) * vq_G, (ns, ns))
            w.fill(fhxc_sGsG)
            w.close()
        mpi.world.barrier()
    prnt(file=self.fd)
# Set-up for a LibElpa check: a random symmetric M x M matrix is created on
# rank 0 and distributed over a BLACS grid.
import numpy as np
from gpaw.utilities.elpa import LibElpa
from gpaw.blacs import BlacsGrid
from gpaw.mpi import world

# Fixed seed so that every run draws the same matrix.
rng = np.random.RandomState(87878787)

# Process grid: 1 x 1 on a single rank, otherwise (size // 2) x 2.
if world.size == 1:
    shape = 1, 1
else:
    shape = world.size // 2, 2

bg = BlacsGrid(world, *shape)

M = 8  # global matrix dimension
blocksize = 2  # used for both block dimensions of the descriptor
desc = bg.new_descriptor(M, M, blocksize, blocksize)
sdesc = desc.as_serial()

Aserial = sdesc.zeros()
if world.rank == 0:
    # Only rank 0 fills the serial matrix; adding the transpose makes it
    # symmetric.
    Aserial[:] = rng.rand(*Aserial.shape)
    Aserial += Aserial.T.copy()

A = desc.distribute_from_master(Aserial)
# Zero-initialised distributed matrices and length-M vectors; presumably
# output buffers for two solvers that are compared later - not used within
# the visible part of the script.
C1 = desc.zeros()
C2 = desc.zeros()
eps1 = np.zeros(M)
eps2 = np.zeros(M)

elpa = LibElpa(desc)
print(elpa)
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Run the PBLAS wrappers on distributed data and check them serially.

    Inputs are random matrices from a ``RandomState`` seeded with *seed*.
    Rank 0 evaluates the reference results, the ``pblas_simple_*`` wrappers
    evaluate the same operations on block-cyclic copies living on an
    *mprocs* x *nprocs* grid, and the gathered results must agree with the
    references to within the module-level ``tol``.  With ``dtype=complex``
    the inputs get a random imaginary part.
    """
    rng = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    epsilon = 1.0j if dtype == complex else 0.0

    def random_matrix(desc):
        # Real part is drawn first, then the (possibly zero-weighted)
        # imaginary part, both with the local shape of *desc*.
        return rng.rand(*desc.shape) + epsilon * rng.rand(*desc.shape)

    # Descriptors whose single block spans the whole matrix (master copy).
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)
    globHEC = grid.new_descriptor(K, K, K, K)

    # Random inputs (draw order matters for reproducibility).
    A0 = random_matrix(globA)
    B0 = random_matrix(globB)
    D0 = random_matrix(globD)
    X0 = random_matrix(globX)

    # Input of the hermitian product HEC = HEA * B.
    HEA0 = random_matrix(globHEC)
    if world.rank == 0:
        # Symmetrise so that the matrix is hermitian.
        HEA0 = HEA0 + HEA0.T.conjugate()

    # Buffers for the serial reference results.
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # rank-updates accumulate into zeros
    U0 = globU.zeros(dtype=dtype)  # rank-updates accumulate into zeros
    HEC0 = globB.zeros(dtype=dtype)

    if rank == 0:
        # Reference for C0 = A0 . B0
        gemm(1.0, B0, A0, 0.0, C0)
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Reference for Y0 = A0 . X0
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)

        HEC0[:] = np.dot(HEA0, B0)
        nrows, ncols = HEA0.shape
        # The strict upper triangle is not used: fill it with a sentinel,
        # one row at a time.
        for row in range(nrows):
            HEA0[row, row + 1:ncols] = 99999.0
        if world.rank == 0:
            print(HEA0)

    assert globA.check(A0) and globB.check(B0) and globC.check(C0)
    assert globX.check(X0) and globY.check(Y0)
    assert globD.check(D0) and globS.check(S0) and globU.check(U0)

    # Block-cyclic descriptors with a variety of block sizes.
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)
    distHE = grid.new_descriptor(K, K, 2, 4)

    # Local pieces of the distributed matrices.
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # rank-updates accumulate into zeros
    U = distU.zeros(dtype=dtype)  # rank-updates accumulate into zeros
    HEC = distB.zeros(dtype=dtype)
    HEA = distHE.zeros(dtype=dtype)

    # Scatter the inputs from the master layout to the block-cyclic one.
    Redistributor(world, globA, distA).redistribute(A0, A)
    Redistributor(world, globB, distB).redistribute(B0, B)
    Redistributor(world, globX, distX).redistribute(X0, X)
    Redistributor(world, globD, distD).redistribute(D0, D)
    Redistributor(world, globHEC, distHE).redistribute(HEA0, HEA)

    # The operations under test.
    pblas_simple_gemm(distA, distB, distC, A, B, C)
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa="T")
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)
    pblas_simple_hemm(distHE, distB, distB, HEA, B, HEC, uplo="L", side="L")

    # Gather the distributed results in the master layout.
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # rank-updates accumulate into zeros
    U1 = globU.zeros(dtype=dtype)  # rank-updates accumulate into zeros
    HEC1 = globB.zeros(dtype=dtype)
    Redistributor(world, distC, globC).redistribute(C, C1)
    Redistributor(world, distY, globY).redistribute(Y, Y1)
    Redistributor(world, distS, globS).redistribute(S, S1)
    Redistributor(world, distU, globU).redistribute(U, U1)
    Redistributor(world, distB, globB).redistribute(HEC, HEC1)

    # Only rank 0 holds data to compare; the other ranks contribute zero.
    gemm_err = gemv_err = r2k_err = rk_err = hemm_err = 0.0
    if rank == 0:
        gemm_err = abs(C1 - C0).max()
        gemv_err = abs(Y1 - Y0).max()
        r2k_err = abs(S1 - S0).max()
        rk_err = abs(U1 - U0).max()
        hemm_err = abs(HEC1 - HEC0).max()
        print("gemm err", gemm_err)
        print("gemv err", gemv_err)
        print("r2k err", r2k_err)
        print("rk_err", rk_err)
        print("hemm_err", hemm_err)

    # Share the errors so that a failure is raised on every rank, not on
    # one cpu only.
    gemm_err = world.sum(gemm_err)
    gemv_err = world.sum(gemv_err)
    r2k_err = world.sum(r2k_err)
    rk_err = world.sum(rk_err)
    hemm_err = world.sum(hemm_err)

    equal(gemm_err, 0, tol)
    equal(gemv_err, 0, tol)
    equal(r2k_err, 0, tol)
    equal(rk_err, 0, tol)
    equal(hemm_err, 0, tol)