def __init__(self, sl_lrtddft, nrows, lr_comms):
    """Create the BLACS grids, descriptors and redistributors.

    sl_lrtddft -- iterable unpacked as (mprocs, nprocs, block_size)
    nrows      -- dimension of the (square) matrix
    lr_comms   -- object providing parent_comm, dd_comm and eh_comm
    """
    self.mprocs, self.nprocs, self.block_size = tuple(sl_lrtddft)
    self.lr_comms = lr_comms
    parent_comm = self.lr_comms.parent_comm

    # Grid describing how the matrix is stored originally.
    self.matrix_grid = BlacsGrid(parent_comm,
                                 self.lr_comms.dd_comm.size,
                                 self.lr_comms.eh_comm.size)

    # Grid on which the diagonalization is carried out.
    self.diag_grid = BlacsGrid(parent_comm, self.mprocs, self.nprocs)

    # ScaLAPACK needs the TRANSPOSED matrix (and vector).  The matrix is
    # square, so rows and columns are both nrows; the storage layout uses
    # 1x1 blocks, the diagonalization layout square blocks of block_size.
    self.matrix_descr = self.matrix_grid.new_descriptor(nrows, nrows, 1, 1)
    self.diag_descr = self.diag_grid.new_descriptor(nrows, nrows,
                                                    self.block_size,
                                                    self.block_size)

    # Storage layout -> diagonalization layout, and back again.
    self.diag_in_redist = Redistributor(parent_comm,
                                        self.matrix_descr,
                                        self.diag_descr)
    self.diag_out_redist = Redistributor(parent_comm,
                                         self.diag_descr,
                                         self.matrix_descr)
def scalapack_diagonalize(self, H_sS):
    """Diagonalize the row-distributed matrix H_sS with ScaLAPACK.

    Returns (w_S, v_sS.conj()): the eigenvalue array and the complex
    conjugate of the eigenvector array redistributed back to the layout
    of H_sS.
    """
    dim = self.nS
    blocksize = 32

    # Input layout: one block of self.nS_local rows per rank.
    row_grid = BlacsGrid(world, size, 1)
    row_desc = row_grid.new_descriptor(dim, dim, self.nS_local, dim)

    # Work layout: (size // 2) x 2 grid with square blocks.
    block_grid = BlacsGrid(world, size // 2, 2)
    block_desc = block_grid.new_descriptor(dim, dim, blocksize, blocksize)

    A_ss = block_desc.empty(dtype=H_sS.dtype)
    Redistributor(world, row_desc, block_desc).redistribute(H_sS, A_ss)

    # Diagonalize using the lower triangle.
    v_ss = block_desc.zeros(dtype=A_ss.dtype)
    w_S = np.zeros(dim, dtype=float)
    block_desc.diagonalize_dc(A_ss, v_ss, w_S, 'L')

    # Bring the eigenvectors back to the row layout.
    v_sS = np.zeros_like(H_sS)
    Redistributor(world, block_desc, row_desc).redistribute(v_ss, v_sS)
    return w_S, v_sS.conj()
def diagonalize(self):
    """Diagonalize the Hamiltonian.

    The t and T represent local and global eigenstates indices
    respectively.

    Without self.td the (non-Hermitian) matrix is collected and solved
    with numpy.linalg.eig on rank 0; only the eigenvalues are broadcast.
    With self.td LAPACK (single process) or ScaLAPACK (parallel) is used
    and the eigenvectors are stored in self.v_St.
    """
    print('Diagonalizing Hamiltonian', file=self.fd)

    if not self.td:
        # Non-Hermitian matrix can only use linalg.eig
        print(' Using numpy.linalg.eig...', file=self.fd)
        print(' Eliminated %s pair orbitals' % len(self.excludef_S),
              file=self.fd)

        self.H_SS = self.collect_A_SS(self.H_sS)
        self.w_T = np.zeros(self.nS - len(self.excludef_S), complex)
        if world.rank == 0:
            self.H_SS = np.delete(self.H_SS, self.excludef_S, axis=0)
            self.H_SS = np.delete(self.H_SS, self.excludef_S, axis=1)
            self.w_T, self.v_ST = np.linalg.eig(self.H_SS)
        world.broadcast(self.w_T, 0)
        self.df_S = np.delete(self.df_S, self.excludef_S)
        self.rhoG0_S = np.delete(self.rhoG0_S, self.excludef_S)
    elif world.size == 1:
        # Here the eigenvectors are returned as complex conjugated rows
        print(' Using lapack...', file=self.fd)
        from gpaw.utilities.lapack import diagonalize
        self.w_T = np.zeros(self.nS)
        diagonalize(self.H_sS, self.w_T)
        self.v_St = self.H_sS.conj().T
    else:
        # Here the eigenvectors are returned as complex conjugated rows
        print(' Using scalapack...', file=self.fd)
        nS = self.nS
        # Rows per rank: ceil(nbzkpts / world.size) k-points times the
        # number of pair orbitals per k-point.
        ns = -(-self.kd.nbzkpts // world.size) * (
            self.nv * self.nc * self.spins * (self.spinors + 1)**2)
        grid = BlacsGrid(world, world.size, 1)
        desc = grid.new_descriptor(nS, nS, ns, nS)
        desc2 = grid.new_descriptor(nS, nS, 2, 2)

        H_tmp = desc2.zeros(dtype=complex)
        r = Redistributor(world, desc, desc2)
        r.redistribute(self.H_sS, H_tmp)

        self.w_T = np.empty(nS)
        v_tmp = desc2.empty(dtype=complex)
        desc2.diagonalize_dc(H_tmp, v_tmp, self.w_T)

        r = Redistributor(grid.comm, desc2, desc)
        self.v_St = desc.zeros(dtype=complex)
        r.redistribute(v_tmp, self.v_St)
        self.v_St = self.v_St.conj().T

    if self.write_v and self.td:
        # Cannot use par_save without td
        self.par_save('v_TS.ulm', 'v_TS', self.v_St.T)
def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
             blocksize, nao, timer=nulltimer):
    """Set up the BLACS layouts for nao x nao and nbands x nao matrices.

    After delegating to BlacsLayouts.__init__, this builds the column
    grids, their descriptors and the redistributors mM2mm and mm2nM.
    """
    BlacsLayouts.__init__(self, gd, bd, block_comm, dtype,
                          mcpus, ncpus, blocksize, timer)
    band_count = bd.nbands
    my_band_count = bd.mynbands
    self.blocksize = blocksize
    self.mynbands = my_band_count

    self.orbital_comm = self.bd.comm
    # Ceiling division: basis functions per rank of the orbital comm.
    nao_block = -((-nao) // self.orbital_comm.size)
    self.naoblocksize = nao_block
    self.nao = nao

    # Range of basis functions for BLACS distribution of matrices:
    self.Mmax = nao
    self.Mstart = bd.comm.rank * nao_block
    self.Mstop = min(self.Mstart + nao_block, self.Mmax)
    self.mynao = self.Mstop - self.Mstart

    # Column layout for one matrix per band rank:
    self.columngrid = BlacsGrid(bd.comm, bd.comm.size, 1)
    self.mMdescriptor = self.columngrid.new_descriptor(
        nao, nao, nao_block, nao)
    self.nMdescriptor = self.columngrid.new_descriptor(
        band_count, nao, my_band_count, nao)

    # Column layout for one matrix in total (only on grid masters):
    self.single_column_grid = BlacsGrid(self.column_comm, bd.comm.size, 1)
    self.mM_unique_descriptor = self.single_column_grid.new_descriptor(
        nao, nao, nao_block, nao)

    # nM_unique_descriptor is meant to hold the coefficients after
    # diagonalization; only the first nbands columns carry meaningful
    # data.  The array is later trimmed and broadcast across the grid
    # descriptor's communicator.
    self.nM_unique_descriptor = self.single_column_grid.new_descriptor(
        band_count, nao, my_band_count, nao)

    # Fully blocked grid for diagonalization with many CPUs:
    self.mmdescriptor = self.blockgrid.new_descriptor(
        nao, nao, blocksize, blocksize)

    self.mM2mm = Redistributor(self.block_comm,
                               self.mM_unique_descriptor,
                               self.mmdescriptor)
    self.mm2nM = Redistributor(self.block_comm,
                               self.mmdescriptor,
                               self.nM_unique_descriptor)
def test_overlaps_nonhermitian(self):
    """Check overlap matrix elements for a non-Hermitian operator.

    The operator is the reference overlap scaled by a random (possibly
    complex) factor alpha, so the expected result is alpha * self.S0_nn.
    """
    alpha = np.random.normal(size=1).astype(self.dtype)
    if self.dtype == complex:
        alpha += 1j * np.random.normal(size=1)
    world.broadcast(alpha, 0)

    # Set up non-Hermitian overlap operator:
    def S(x):
        return alpha * x

    def dS(a, P_ni):
        return np.dot(alpha * P_ni, self.setups[a].dO_ii)

    nblocks = self.get_optimal_number_of_blocks(self.blocking)
    # 'async' is a reserved word since Python 3.7, so the attribute of
    # that name can only be reached through getattr.
    use_async = getattr(self, 'async')
    overlap = MatrixOperator(self.ksl, nblocks, use_async, False)
    if 0:
        # XXX non-hermitian case so Nn2nn not just uplo='L' but rather 'G'
        blockcomm = self.ksl.nndescriptor.blacsgrid.comm
        self.ksl.Nn2nn = Redistributor(blockcomm,
                                       self.ksl.Nndescriptor,
                                       self.ksl.nndescriptor)
    S_nn = overlap.calculate_matrix_elements(self.psit_nG, self.P_ani,
                                             S, dS)

    if memstats:
        self.mem_test = record_memory()

    S_NN = self.ksl.nndescriptor.collect_on_master(S_nn)
    if self.bd.comm.rank == 0 and self.gd.comm.rank == 0:
        assert S_NN.shape == (self.bd.nbands, ) * 2
        S_NN = S_NN.T.copy()  # Fortran -> C indexing
    else:
        assert S_NN.nbytes == 0
        S_NN = np.empty((self.bd.nbands, ) * 2, dtype=S_NN.dtype)

    # Two-stage broadcast: first over domains (band masters only),
    # then over bands.
    if self.bd.comm.rank == 0:
        self.gd.comm.broadcast(S_NN, 0)
    self.bd.comm.broadcast(S_NN, 0)

    self.check_and_plot(S_NN, alpha * self.S0_nn, 9,
                        'overlaps,nonhermitian')
def calculate_blocked_density_matrix(self, f_n, C_nM):
    """Return the density matrix rho_mm in the blocked (mm) layout.

    f_n are the occupations and C_nM the coefficients.  Only ranks with
    self.gd.rank == 0 contribute real data; the others feed zero arrays
    into the redistribution.
    """
    band_count = self.bd.nbands
    basis_count = self.nao
    dtype = C_nM.dtype

    self.nMdescriptor.checkassert(C_nM)
    if self.gd.rank != 0:
        C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
        Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
    else:
        Cf_nM = (C_nM * f_n[:, None]).conj()

    to_blocks = Redistributor(self.block_comm,
                              self.nM_unique_descriptor,
                              self.mmdescriptor)

    Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
    to_blocks.redistribute(Cf_nM, Cf_mm, band_count, basis_count)
    del Cf_nM

    C_mm = self.mmdescriptor.zeros(dtype=dtype)
    to_blocks.redistribute(C_nM, C_mm, band_count, basis_count)
    # C_nM belongs to the caller, so deleting it here frees nothing.

    rho_mm = self.mmdescriptor.zeros(dtype=dtype)
    pblas_simple_gemm(self.mmdescriptor,
                      self.mmdescriptor,
                      self.mmdescriptor,
                      Cf_mm, C_mm, rho_mm, transa='T')
    return rho_mm
def parallel_eigh(matrixfile, blacsgrid=(4, 2), blocksize=64):
    """Diagonalize the matrix stored in matrixfile in parallel.

    The matrix is loaded with np.load on MASTER, spread over a BLACS
    grid of shape blacsgrid in blocksize x blocksize blocks and
    diagonalized.  Returns (eigenvalues, eigenvectors) on MASTER and
    (None, None) on every other rank.
    """
    assert np.prod(blacsgrid) == world.size
    grid = BlacsGrid(world, *blacsgrid)
    is_master = world.rank == MASTER

    if is_master:
        H_MM = np.load(matrixfile)
        assert H_MM.ndim == 2
        assert H_MM.shape[0] == H_MM.shape[1]
        NM = len(H_MM)
    else:
        NM = 0
    # Only MASTER contributes a non-zero value, so the sum distributes
    # the matrix dimension to all nodes.
    NM = world.sum(NM)

    # Descriptor for the individual blocks ...
    block_desc = grid.new_descriptor(NM, NM, blocksize, blocksize)
    # ... and for the global array living on MASTER.
    local_desc = grid.new_descriptor(NM, NM, NM, NM)

    # Every other rank needs a dummy array of its local shape.
    if not is_master:
        H_MM = local_desc.zeros()
    assert local_desc.check(H_MM)

    # Distribute the global array into the smaller blocks.
    H_mm = block_desc.empty()
    Redistributor(world, local_desc, block_desc).redistribute(H_MM, H_mm)

    # Eigenvalues and eigenvectors.
    eps_M = np.empty(NM)
    C_mm = block_desc.empty()
    block_desc.diagonalize_ex(H_mm, C_mm, eps_M)

    # Collect the eigenvectors on MASTER.
    C_MM = local_desc.empty()
    Redistributor(world, block_desc, local_desc).redistribute(C_mm, C_MM)

    if not is_master:
        return None, None
    return eps_M, C_MM
def test_multiply_nonhermitian(self):
    """Check matrix_multiply followed by non-Hermitian matrix elements.

    The wave functions are transformed with a random matrix C_NN and
    the resulting matrix elements are compared against
    C_NN^dag * (alpha * S0_nn) * C_NN.
    """
    alpha = np.random.normal(size=1).astype(self.dtype)
    if self.dtype == complex:
        alpha += 1j * np.random.normal(size=1)
    world.broadcast(alpha, 0)

    # Known starting point of S_nn = <psit_m|S|psit_n>
    S_NN = alpha * self.S0_nn

    nelements = self.nbands**2
    if self.dtype == complex:
        # Random modulus times random phase.
        C_NN = (np.random.uniform(size=nelements) *
                np.exp(1j * np.random.uniform(0, 2 * np.pi,
                                              size=nelements)))
    else:
        C_NN = np.random.normal(size=nelements)
    # Normalize by the 2-norm of the flat vector, then make it square.
    C_NN = C_NN.reshape(
        (self.nbands, self.nbands)) / np.linalg.norm(C_NN, 2)
    world.broadcast(C_NN, 0)

    # Set up non-Hermitian overlap operator:
    def S(x):
        return alpha * x

    def dS(a, P_ni):
        return np.dot(alpha * P_ni, self.setups[a].dO_ii)

    nblocks = self.get_optimal_number_of_blocks(self.blocking)
    # 'async' is a reserved word since Python 3.7, so the attribute of
    # that name can only be reached through getattr.
    use_async = getattr(self, 'async')
    overlap = MatrixOperator(self.ksl, nblocks, use_async, False)
    if 0:
        # XXX non-hermitian case so Nn2nn not just uplo='L' but rather 'G'
        blockcomm = self.ksl.nndescriptor.blacsgrid.comm
        self.ksl.Nn2nn = Redistributor(blockcomm,
                                       self.ksl.Nndescriptor,
                                       self.ksl.nndescriptor)

    if self.bd.comm.rank == 0 and self.gd.comm.rank == 0:
        assert C_NN.shape == (self.bd.nbands, ) * 2
        tmp_NN = C_NN.T.copy()  # C -> Fortran indexing
    else:
        tmp_NN = self.ksl.nndescriptor.as_serial().empty(
            dtype=C_NN.dtype)
    C_nn = self.ksl.nndescriptor.distribute_from_master(tmp_NN)

    self.psit_nG = overlap.matrix_multiply(C_nn, self.psit_nG,
                                           self.P_ani)
    D_nn = overlap.calculate_matrix_elements(self.psit_nG, self.P_ani,
                                             S, dS)

    if memstats:
        self.mem_test = record_memory()

    D_NN = self.ksl.nndescriptor.collect_on_master(D_nn)
    if self.bd.comm.rank == 0 and self.gd.comm.rank == 0:
        assert D_NN.shape == (self.bd.nbands, ) * 2
        D_NN = D_NN.T.copy()  # Fortran -> C indexing
    else:
        assert D_NN.nbytes == 0
        D_NN = np.empty((self.bd.nbands, ) * 2, dtype=D_NN.dtype)

    # Two-stage broadcast: first over domains (band masters only),
    # then over bands.
    if self.bd.comm.rank == 0:
        self.gd.comm.broadcast(D_NN, 0)
    self.bd.comm.broadcast(D_NN, 0)

    # D_nn = C_nn^dag * S_nn * C_nn
    D0_NN = np.dot(C_NN.T.conj(), np.dot(S_NN, C_NN))
    self.check_and_plot(D_NN, D0_NN, 9, 'multiply,nonhermitian')
def scal_diagonalize(A, nodes='master'):
    """Diagonalize the Hermitian N x N matrix A with ScaLAPACK.

    Usage: eps, B = scal_diagonalize(A)

    eps and B are the eigenvalues and eigenvectors.

    nodes = 'master': eigenvectors only available on master node
    nodes = 'all': eigenvectors broadcast to all nodes

    Any other value of nodes makes the function return None.
    Raises AssertionError if A is not square or not exactly Hermitian.
    """
    # make sure A is N*N, and hermitian
    N = A.shape[0]
    assert A.shape[0] == A.shape[1]
    # Vectorized form of checking A[i, j] == A[j, i].conj() for all i, j.
    assert np.array_equal(A, A.conj().T)

    # create blacs descriptor
    mb = 64
    g = BlacsGrid(world, 2, size // 2)
    nndesc1 = g.new_descriptor(N, N, N, N)
    nndesc2 = g.new_descriptor(N, N, mb, mb)

    # distribute A to blacs grid A_
    if rank != 0:
        A = nndesc1.zeros(dtype=A.dtype)
    A_ = nndesc2.empty(dtype=A.dtype)
    redistributor = Redistributor(world, nndesc1, nndesc2)
    redistributor.redistribute(A, A_)

    # diagonalize
    B_ = nndesc2.zeros(dtype=A.dtype)
    # NOTE(review): eps takes A's dtype; for complex A the eigenvalue
    # buffer is complex although Hermitian eigenvalues are real -- confirm
    # that diagonalize_dc accepts this.
    eps = np.zeros(N, dtype=A.dtype)
    nndesc2.diagonalize_dc(A_, B_, eps, 'L')

    # distribute the eigenvectors to master
    B = np.zeros_like(A)
    redistributor = Redistributor(world, nndesc2, nndesc1)
    redistributor.redistribute(B_, B)

    if nodes == 'master':
        return eps, B
    elif nodes == 'all':
        if rank != 0:
            # The receive buffer must match the master's dtype; a plain
            # float buffer would be wrong for complex matrices.
            B = np.zeros((N, N), dtype=A.dtype)
        world.broadcast(B, 0)
        return eps, B
def distribute_to_columns(self, rho_mm, srcdescriptor):
    """Redistribute rho_mm from srcdescriptor to the column layout.

    The result of the redistribution is kept on ranks with
    self.gd.rank == 0; all other ranks allocate a zero array and
    receive the data through a broadcast over self.gd.comm.
    """
    to_columns = Redistributor(self.block_comm,  # XXX
                               srcdescriptor,
                               self.mM_unique_descriptor)
    rho_mM = to_columns.redistribute(rho_mm)
    on_domain_master = self.gd.rank == 0
    if not on_domain_master:
        rho_mM = self.mMdescriptor.zeros(dtype=rho_mm.dtype)
    self.gd.comm.broadcast(rho_mM, 0)
    return rho_mM
def initialize(self, paw, hamiltonian=None):
    """Initialize the propagator and, with BLACS, its redistributors.

    paw is passed on to LCAOPropagator.initialize; hamiltonian, when
    given, replaces self.hamiltonian.
    """
    LCAOPropagator.initialize(self, paw)
    if hamiltonian is not None:
        self.hamiltonian = hamiltonian

    ksl = self.wfs.ksl
    self.blacs = ksl.using_blacs
    if not self.blacs:
        return

    from gpaw.blacs import Redistributor
    self.log('BLACS Parallelization')

    # Parallel grid descriptors
    blockgrid = ksl.blockgrid
    assert blockgrid.nprow * blockgrid.npcol == ksl.block_comm.size
    self.mm_block_descriptor = ksl.mmdescriptor
    self.Cnm_block_descriptor = blockgrid.new_descriptor(
        ksl.bd.nbands, ksl.nao, ksl.blocksize, ksl.blocksize)
    self.CnM_unique_descriptor = ksl.nM_unique_descriptor

    # Redistributors between the blocked and the unique layout
    self.Cnm2nM = Redistributor(ksl.block_comm,
                                self.Cnm_block_descriptor,
                                self.CnM_unique_descriptor)
    self.CnM2nm = Redistributor(ksl.block_comm,
                                self.CnM_unique_descriptor,
                                self.Cnm_block_descriptor)

    if debug:
        # Extra descriptor holding the whole matrix in a single block.
        nao = ksl.nao
        self.MM_descriptor = blockgrid.new_descriptor(nao, nao, nao, nao)
        self.mm2MM = Redistributor(ksl.block_comm,
                                   self.mm_block_descriptor,
                                   self.MM_descriptor)
        self.MM2mm = Redistributor(ksl.block_comm,
                                   self.MM_descriptor,
                                   self.mm_block_descriptor)

    for kpt in self.wfs.kpt_u:
        for matrix in (kpt.S_MM, kpt.T_MM):
            scalapack_zero(self.mm_block_descriptor, matrix, 'U')
def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
             blocksize, buffer_size=None, timer=nulltimer):
    """Set up the BLACS layouts for nbands x nbands matrices.

    After delegating to BlacsLayouts.__init__, this builds a column
    grid, a row grid, their descriptors and the redistributors Nn2nn
    and nn2nN.
    """
    BlacsLayouts.__init__(self, gd, bd, block_comm, dtype,
                          mcpus, ncpus, blocksize, timer)
    self.buffer_size = buffer_size
    band_count = bd.nbands
    my_band_count = bd.mynbands
    self.mynbands = my_band_count
    self.blocksize = blocksize
    band_ranks = bd.comm.size

    # 1D layout - columns
    self.columngrid = BlacsGrid(self.column_comm, 1, band_ranks)
    self.Nndescriptor = self.columngrid.new_descriptor(
        band_count, band_count, band_count, my_band_count)

    # 2D layout
    self.nndescriptor = self.blockgrid.new_descriptor(
        band_count, band_count, blocksize, blocksize)

    # 1D layout - rows
    self.rowgrid = BlacsGrid(self.column_comm, band_ranks, 1)
    self.nNdescriptor = self.rowgrid.new_descriptor(
        band_count, band_count, my_band_count, band_count)

    # The full matrix is redistributed here.  Passing 'L' as an extra
    # argument would move only the filled-out half of a Hermitian
    # matrix (XXX faster but...).
    self.Nn2nn = Redistributor(self.block_comm,
                               self.Nndescriptor,
                               self.nndescriptor)

    # Resulting matrix will be used in dgemm which is symmetry oblivious
    self.nn2nN = Redistributor(self.block_comm,
                               self.nndescriptor,
                               self.nNdescriptor)
def redistribute_H(self, H_sS):
    """Return H redistributed from a row-wise to a column-wise layout.

    The source layout has self.nS_local rows per rank on a (size x 1)
    grid; the target has self.nS_local columns per rank on a
    (1 x size) grid.
    """
    dim = self.nS
    row_desc = BlacsGrid(world, size, 1).new_descriptor(
        dim, dim, self.nS_local, dim)
    col_desc = BlacsGrid(world, 1, size).new_descriptor(
        dim, dim, dim, self.nS_local)

    H_Ss = col_desc.empty(dtype=H_sS.dtype)
    Redistributor(world, row_desc, col_desc).redistribute(H_sS, H_Ss)
    return H_Ss
def distribute_MM(wfs, a_MM):
    """Distribute the full matrix a_MM over the BLACS block layout.

    Without BLACS (wfs.ksl.using_blacs false) a_MM is returned
    unchanged.  Otherwise the data on rank 0 of the block communicator
    is spread to the layout of ksl.mmdescriptor and the local part is
    returned; the input on other ranks is ignored.
    """
    layouts = wfs.ksl
    if not layouts.using_blacs:
        return a_MM

    dtype = a_MM.dtype
    comm = layouts.block_comm
    nao = layouts.nao

    # Single-block descriptor holding the whole matrix.
    full_desc = BlacsGrid(comm, 1, 1).new_descriptor(nao, nao, nao, nao)
    MM2mm = Redistributor(comm, full_desc, layouts.mmdescriptor)

    if comm.rank != 0:
        a_MM = full_desc.empty(dtype=dtype)
    a_mm = layouts.mmdescriptor.empty(dtype=dtype)
    MM2mm.redistribute(a_MM, a_mm)
    return a_mm
def redistribute(self, in_wGG, out_x=None):
    """Redistribute array.

    Switch between two kinds of parallel distributions:

    1) parallel over G-vectors (second dimension of in_wGG)
    2) parallel over frequency (first dimension of in_wGG)

    The direction is chosen from the input: if in_wGG holds all
    len(self.omega_w) frequencies it is taken to be in layout 1) and
    is converted to 2), otherwise the other way around.

    Returns new array using the memory in the 1-d array out_x; a fresh
    complex array is allocated when out_x is None.  With a block
    communicator of size 1 the input is returned unchanged.
    """
    comm = self.blockcomm

    if comm.size == 1:
        return in_wGG

    nw = len(self.omega_w)
    nG = in_wGG.shape[2]
    # Ceiling divisions: local number of frequencies / G-vectors.
    mynw = (nw + comm.size - 1) // comm.size
    mynG = (nG + comm.size - 1) // comm.size

    bg1 = BlacsGrid(comm, comm.size, 1)
    bg2 = BlacsGrid(comm, 1, comm.size)
    md1 = BlacsDescriptor(bg1, nw, nG**2, mynw, nG**2)
    md2 = BlacsDescriptor(bg2, nw, nG**2, nw, mynG * nG)

    if len(in_wGG) == nw:
        mdin, mdout = md2, md1
    else:
        mdin, mdout = md1, md2

    r = Redistributor(comm, mdin, mdout)

    outshape = (mdout.shape[0], mdout.shape[1] // nG, nG)
    if out_x is None:
        out_wGG = np.empty(outshape, complex)
    else:
        # np.prod instead of the np.product alias removed in NumPy 2.0.
        out_wGG = out_x[:np.prod(outshape)].reshape(outshape)

    r.redistribute(in_wGG.reshape(mdin.shape),
                   out_wGG.reshape(mdout.shape))

    return out_wGG
def test(comm, M, N, mcpus, ncpus, mb, nb):
    """Print how an M x N matrix is laid out on an mcpus x ncpus grid.

    Each rank prints the global index slices of its local blocks
    (block sizes mb x nb).  The matrix, filled with the owning rank
    number, is then gathered onto a 1 x 1 grid and printed by rank 0.
    """
    def banner(text):
        print(text)
        print('-' * len(text))

    # Whole matrix in one block; pre-filled with a value no rank owns.
    serial_desc = BlacsGrid(comm, 1, 1).new_descriptor(M, N, M, N, 0, 0)
    A_mn = serial_desc.zeros(dtype=float)
    A_mn[:] = comm.size + 1

    # Distributed matrix; every element holds the owning rank.
    dist_desc = BlacsGrid(comm, mcpus, ncpus).new_descriptor(
        M, N, mb, nb, 0, 0)  # ???
    B_mn = dist_desc.zeros(dtype=float)
    B_mn[:] = comm.rank

    if comm.rank == 0:
        banner('Slices of global matrix indices by rank')

    # Ranks take turns so that the output is not interleaved.
    for turn in range(comm.size):
        comm.barrier()
        if turn != comm.rank:
            continue
        print('Rank %d:' % turn)
        previous_Mstart = -1
        for Mstart, Mstop, Nstart, Nstop, block in dist_desc.my_blocks(B_mn):
            # Start a new output line whenever the row range advances.
            if previous_Mstart >= 0 and Mstart > previous_Mstart:
                print()
            print('[%3d:%3d, %3d:%3d]' % (Mstart, Mstop, Nstart, Nstop),
                  end=' ')
            previous_Mstart = Mstart
            assert (block == comm.rank).all()
        print()
        print()
    comm.barrier()

    Redistributor(comm, dist_desc, serial_desc).redistribute(B_mn, A_mn)

    if comm.rank == 0:
        banner('Rank where each element of the global matrix is stored')
        print(A_mn)
def collect_wuMM(wfs, a_wuMM, w, s, k):
    """Collect the matrix a_wuMM[w][u] for spin s, k-point k on the master.

    Based on gpaw/wavefunctions/base.py:
    WaveFunctions.collect_auxiliary().

    Returns the full (nao, nao) matrix on the rank that ends up holding
    it (asserted to be world rank 0) and None everywhere else.
    """
    dtype = a_wuMM[0][0].dtype
    layouts = wfs.ksl
    nao = layouts.nao
    kpt_rank, u = wfs.kd.get_rank_and_index(s, k)
    block_comm = layouts.block_comm
    is_block_master = block_comm.rank == 0

    if wfs.kd.comm.rank != kpt_rank:
        # Not the owner of this k-point: only the global master has
        # something to do, namely receive the matrix from the owner.
        if is_block_master and kpt_rank != 0:
            assert wfs.world.rank == 0
            a_MM = np.empty((nao, nao), dtype=dtype)
            wfs.kd.comm.receive(a_MM, kpt_rank, 2017)
            return a_MM
        return None

    a_MM = a_wuMM[w][u]

    # Collect within blacs grid
    if layouts.using_blacs:
        a_mm = a_MM
        full_desc = BlacsGrid(block_comm, 1, 1).new_descriptor(
            nao, nao, nao, nao)
        mm2MM = Redistributor(block_comm, layouts.mmdescriptor, full_desc)
        a_MM = full_desc.empty(dtype=dtype)
        mm2MM.redistribute(a_mm, a_MM)

    if not is_block_master:
        return None

    # KSL master sends a_MM to the global master
    if kpt_rank == 0:
        assert wfs.world.rank == 0
        # I have it already
        return a_MM
    wfs.kd.comm.send(a_MM, 0, 2017)
    return None
def calculate_density_matrix(self, f_n, C_nM, rho_mM=None):
    """Calculate density matrix from occupations and coefficients.

    Presently this function performs the usual scalapack 3-step trick:
    redistribute-numbercrunching-backdistribute.

    If rho_mM is given it is used as the output buffer: the result is
    copied into it on gd rank 0 before the broadcast over self.gd.comm.
    Otherwise a new array is returned.

    Notes on future performance improvement.

    As per the current framework, C_nM exists as copies on each
    domain, i.e. this is not parallel over domains.  We'd like to
    correct this and have an efficient distribution using e.g. the
    block communicator.

    The diagonalization routine and other parts of the code should
    however be changed to accommodate the following scheme:

    Keep coefficients in C_mm form after the diagonalization.
    rho_mm can then be directly calculated from C_mm without
    redistribution, after which we only need to redistribute
    rho_mm across domains.
    """
    dtype = C_nM.dtype
    rho_mm = self.calculate_blocked_density_matrix(f_n, C_nM)
    rback = Redistributor(self.block_comm, self.mmdescriptor,
                          self.mM_unique_descriptor)
    rho1_mM = self.mM_unique_descriptor.zeros(dtype=dtype)
    rback.redistribute(rho_mm, rho1_mM)
    del rho_mm

    if rho_mM is None:
        if self.gd.rank == 0:
            rho_mM = rho1_mM
        else:
            rho_mM = self.mMdescriptor.zeros(dtype=dtype)
    elif self.gd.rank == 0:
        # Fill the caller's buffer; without this copy the freshly
        # computed matrix would be discarded and the broadcast would
        # send whatever the buffer contained on entry.
        rho_mM[:] = rho1_mM

    self.gd.comm.broadcast(rho_mM, 0)
    return rho_mM
def distribute_frequencies(self, chi0_wGG):
    """Distribute frequencies to all cores.

    Returns the slice of frequencies belonging to this rank of
    self.world.  With a single world rank the input itself is
    returned; with a block communicator of size 1 a copy of the local
    frequency slice is returned without any redistribution.
    """
    world = self.world
    comm = self.blockcomm

    if world.size == 1:
        return chi0_wGG

    nw = len(self.omega_w)
    nG = chi0_wGG.shape[2]
    # Ceiling divisions: frequencies per world rank, G-vectors per
    # block rank.
    mynw = -(-nw // world.size)
    mynG = -(-nG // comm.size)

    # Local frequency window [wa, wb), clipped to the valid range.
    wa = min(world.rank * mynw, nw)
    wb = min(wa + mynw, nw)
    nmyw = wb - wa

    if self.blockcomm.size == 1:
        return chi0_wGG[wa:wb].copy()

    # Source layout: only ranks with kncomm.rank == 0 hold real data,
    # the rest take part with an empty dry-run grid.
    if self.kncomm.rank != 0:
        bg1 = DryRunBlacsGrid(mpi.serial_comm, 1, 1)
        in_wGG = np.zeros((0, 0), complex)
    else:
        bg1 = BlacsGrid(comm, 1, comm.size)
        in_wGG = chi0_wGG.reshape((nw, -1))
    md1 = BlacsDescriptor(bg1, nw, nG**2, nw, mynG * nG)

    # Target layout: frequencies spread over all world ranks.
    bg2 = BlacsGrid(world, world.size, 1)
    md2 = BlacsDescriptor(bg2, nw, nG**2, mynw, nG**2)

    out_wGG = np.empty((nmyw, nG, nG), complex)
    Redistributor(world, md1, md2).redistribute(
        in_wGG, out_wGG.reshape((nmyw, nG**2)))
    return out_wGG
def tddft_init(self):
    """Prepare a real-time LCAO TD-DFT run; only acts on the first call.

    With BLACS the descriptors and redistributors are created and the
    upper triangles of the S and T matrices are zeroed.  For the
    'taylor' propagator the inverse overlap matrix is computed as
    well.  Finally the density mixer is replaced by a DummyMixer and
    the coefficients are copied to kpt.C2_nM.
    """
    if self.tddft_initialized:
        return

    is_master = world.rank == 0
    if is_master:
        print('Initializing real time LCAO TD-DFT calculation.')
        print('XXX Warning: Array use not optimal for memory.')
        print('XXX Taylor propagator probably doesn\'t work')
        print('XXX ...and no arrays are listed in memory estimate yet.')

    ksl = self.wfs.ksl
    self.blacs = ksl.using_blacs
    if self.blacs:
        self.ksl = ksl
        nao = ksl.nao
        nbands = ksl.bd.nbands
        mynbands = ksl.bd.mynbands
        blocksize = ksl.blocksize
        blockgrid = ksl.blockgrid
        columngrid = ksl.single_column_grid
        block_comm = ksl.block_comm

        from gpaw.blacs import Redistributor
        if is_master:
            print('BLACS Parallelization')

        # Parallel grid descriptors
        self.MM_descriptor = blockgrid.new_descriptor(
            nao, nao, nao, nao)  # FOR DEBUG
        self.mm_block_descriptor = blockgrid.new_descriptor(
            nao, nao, blocksize, blocksize)
        self.Cnm_block_descriptor = blockgrid.new_descriptor(
            nbands, nao, blocksize, blocksize)
        self.mM_column_descriptor = columngrid.new_descriptor(
            nao, nao, ksl.naoblocksize, nao)
        self.CnM_unique_descriptor = columngrid.new_descriptor(
            nbands, nao, mynbands, nao)

        # Redistributors
        self.mm2MM = Redistributor(block_comm,
                                   self.mm_block_descriptor,
                                   self.MM_descriptor)  # XXX FOR DEBUG
        self.MM2mm = Redistributor(block_comm,
                                   self.MM_descriptor,
                                   self.mm_block_descriptor)  # XXX FOR DEBUG
        self.Cnm2nM = Redistributor(block_comm,
                                    self.Cnm_block_descriptor,
                                    self.CnM_unique_descriptor)
        self.CnM2nm = Redistributor(block_comm,
                                    self.CnM_unique_descriptor,
                                    self.Cnm_block_descriptor)
        self.mM2mm = Redistributor(block_comm,
                                   self.mM_column_descriptor,
                                   self.mm_block_descriptor)

        for kpt in self.wfs.kpt_u:
            scalapack_zero(self.mm_block_descriptor, kpt.S_MM, 'U')
            scalapack_zero(self.mm_block_descriptor, kpt.T_MM, 'U')

    # XXX to propagator class
    use_taylor = self.propagator == 'taylor'
    if use_taylor and self.blacs:
        for kpt in self.wfs.kpt_u:
            kpt.invS_MM = kpt.S_MM.copy()
            scalapack_inverse(self.mm_block_descriptor, kpt.invS_MM, 'L')

        if self.propagator_debug:
            # Cross-check the parallel inverse against a serial one.
            if is_master:
                print('XXX Doing serial inversion of overlap matrix.')
            self.timer.start('Invert overlap (serial)')
            invS2_MM = self.MM_descriptor.empty(dtype=complex)
            for kpt in self.wfs.kpt_u:
                self.mm2MM.redistribute(self.wfs.S_qMM[kpt.q], invS2_MM)
                world.barrier()
                if is_master:
                    tri2full(invS2_MM, 'L')
                    invS2_MM[:] = inv(invS2_MM.copy())
                    self.invS2_MM = invS2_MM
                kpt.invS2_MM = ksl.mmdescriptor.empty(dtype=complex)
                self.MM2mm.redistribute(invS2_MM, kpt.invS2_MM)
                verify(kpt.invS_MM, kpt.invS2_MM,
                       'overlap par. vs. serial', 'L')
            self.timer.stop('Invert overlap (serial)')
            if is_master:
                print('XXX Overlap inverted.')

    if use_taylor and not self.blacs:
        # Serial case: only the first k-point gets an inverse overlap.
        first_kpt = self.wfs.kpt_u[0]
        first_kpt.invS = inv(first_kpt.S_MM)

    # Reset the density mixer
    self.density.mixer = DummyMixer()
    self.tddft_initialized = True
    for kpt in self.wfs.kpt_u:
        kpt.C2_nM = kpt.C_nM.copy()
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Compare the distributed PBLAS wrappers with serial references.

    Random matrices are generated, the gemm, gemv, r2k, rk and hemm
    reference results are computed on rank 0, the same operations are
    run through the pblas_simple_* wrappers on an mprocs x nprocs grid,
    and the maximum absolute deviations are checked against tol.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    epsilon = 1.0j if dtype == complex else 0.0

    def random_like(desc):
        # Two draws per matrix: real part first, then the part scaled
        # by epsilon.
        return gen.rand(*desc.shape) + epsilon * gen.rand(*desc.shape)

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)
    globHEC = grid.new_descriptor(K, K, K, K)

    # Populate matrices local to master (draw order matters for the
    # random stream):
    A0 = random_like(globA)
    B0 = random_like(globB)
    D0 = random_like(globD)
    X0 = random_like(globX)

    # HEC = HEA * B
    HEA0 = random_like(globHEC)
    if world.rank == 0:
        HEA0 = HEA0 + HEA0.T.conjugate()  # Make H0 hermitean
        HEA0 = np.ascontiguousarray(HEA0)

    # Local result matrices
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U0 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC0 = globB.zeros(dtype=dtype)

    # Local reference matrix product:
    if rank == 0:
        # C0[:] = np.dot(A0, B0)
        gemm(1.0, B0, A0, 0.0, C0)
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Y0[:] = np.dot(A0, X0)
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)

        HEC0[:] = np.dot(HEA0, B0)
        # We don't use the upper triangle, so poison it.
        nrows, ncols = HEA0.shape
        HEA0[np.triu_indices(nrows, k=1, m=ncols)] = 99999.0
        if world.rank == 0:
            print(HEA0)

    assert globA.check(A0) and globB.check(B0) and globC.check(C0)
    assert globX.check(X0) and globY.check(Y0)
    assert globD.check(D0) and globS.check(S0) and globU.check(U0)

    # Create distributed descriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)
    distHE = grid.new_descriptor(K, K, 2, 4)

    # Distributed matrices:
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U = distU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC = distB.zeros(dtype=dtype)
    HEA = distHE.zeros(dtype=dtype)

    # Scatter the inputs from master onto the grid.
    for src_desc, dst_desc, src, dst in ((globA, distA, A0, A),
                                         (globB, distB, B0, B),
                                         (globX, distX, X0, X),
                                         (globD, distD, D0, D),
                                         (globHEC, distHE, HEA0, HEA)):
        Redistributor(world, src_desc, dst_desc).redistribute(src, dst)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa='T')
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)
    pblas_simple_hemm(distHE, distB, distB, HEA, B, HEC,
                      uplo='L', side='L')

    # Collect result back on master
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U1 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC1 = globB.zeros(dtype=dtype)
    for src_desc, dst_desc, src, dst in ((distC, globC, C, C1),
                                         (distY, globY, Y, Y1),
                                         (distS, globS, S, S1),
                                         (distU, globU, U, U1),
                                         (distB, globB, HEC, HEC1)):
        Redistributor(world, src_desc, dst_desc).redistribute(src, dst)

    labels = ('gemm err', 'gemv err', 'r2k err', 'rk_err', 'hemm_err')
    if rank == 0:
        errors = [abs(C1 - C0).max(),
                  abs(Y1 - Y0).max(),
                  abs(S1 - S0).max(),
                  abs(U1 - U0).max(),
                  abs(HEC1 - HEC0).max()]
        for label, error in zip(labels, errors):
            print(label, error)
    else:
        errors = [0.0] * len(labels)

    # We don't like exceptions on only one cpu, so share the errors
    # before checking them.
    errors = [world.sum(error) for error in errors]
    for error in errors:
        equal(error, 0, tol)
def calculate_rkernel(self):
    """Calculate the renormalized kernel and write it to file.

    The kernel is built on the real-space grid (parallelized over grid
    points), Fourier transformed to plane waves in both arguments and,
    for every q-point index iq, written by rank 0 to
    'fhxc_<tag>_<xc>_<ecut>_<iq>.gpw' as the array 'fhxc_sGsG'.
    """
    gd = self.gd
    ng_c = gd.N_c
    cell_cv = gd.cell_cv
    icell_cv = 2 * np.pi * np.linalg.inv(cell_cv)
    vol = np.linalg.det(cell_cv)

    ns = self.calc.wfs.nspins
    n_g = self.n_g  # density on rough grid

    fx_g = ns * self.get_fxc_g(n_g)  # local exchange kernel
    qc_g = (-4 * np.pi * ns / fx_g)**0.5  # cutoff functional
    flocal_g = qc_g**3 * fx_g / (6 * np.pi**2)  # ren. x-kernel for r=r'
    Vlocal_g = 2 * qc_g / np.pi  # ren. Hartree kernel for r=r'

    ng = np.prod(ng_c)  # number of grid points
    r_vg = gd.get_grid_point_coordinates()
    rx_g = r_vg[0].flatten()
    ry_g = r_vg[1].flatten()
    rz_g = r_vg[2].flatten()

    prnt(' %d grid points and %d plane waves at the Gamma point' %
         (ng, self.pd.ngmax), file=self.fd)

    # Unit cells
    R_Rv = []
    weight_R = []
    nR_v = self.unit_cells
    nR = np.prod(nR_v)
    for i in range(-nR_v[0] + 1, nR_v[0]):
        for j in range(-nR_v[1] + 1, nR_v[1]):
            for h in range(-nR_v[2] + 1, nR_v[2]):
                R_Rv.append(i * cell_cv[0] +
                            j * cell_cv[1] +
                            h * cell_cv[2])
                weight_R.append((nR_v[0] - abs(i)) *
                                (nR_v[1] - abs(j)) *
                                (nR_v[2] - abs(h)) / float(nR))
    if nR > 1:
        # with more than one unit cell only the exchange kernel is
        # calculated on the grid. The bare Coulomb kernel is added
        # in PW basis and Vlocal_g only the exchange part
        dv = self.calc.density.gd.dv
        gc = (3 * dv / 4 / np.pi)**(1 / 3.)
        Vlocal_g -= 2 * np.pi * gc**2 / dv
        prnt(' Lattice point sampling: ' +
             '(%s x %s x %s)^2 ' % (nR_v[0], nR_v[1], nR_v[2]) +
             ' Reduced to %s lattice points' % len(R_Rv), file=self.fd)

    # Grid points handled by this rank (ceiling division).
    l_g_size = -(-ng // mpi.world.size)
    l_g_range = range(mpi.world.rank * l_g_size,
                      min((mpi.world.rank + 1) * l_g_size, ng))

    fhxc_qsGr = {}
    for iq in range(len(self.ibzq_qc)):
        fhxc_qsGr[iq] = np.zeros(
            (ns, len(self.pd.G2_qG[iq]), len(l_g_range)), dtype=complex)

    t0 = time()
    # The kernel has removable singularities (e.g. at rr = 0), so
    # invalid/divide warnings are silenced; errstate restores the
    # previous settings even if an exception is raised.
    with np.errstate(invalid='ignore', divide='ignore'):
        # Loop over Lattice points
        for i, R_v in enumerate(R_Rv):
            # Loop over r'.
            # f_rr and V_rr are functions of r (dim. as r_vg[0])
            if i == 1:
                prnt(' Finished 1 cell in %s seconds' % int(time() - t0) +
                     ' - estimated %s seconds left' %
                     int((len(R_Rv) - 1) * (time() - t0)),
                     file=self.fd)
                self.fd.flush()
            if len(R_Rv) > 5:
                # Integer division keeps the modulus an int so that
                # progress is reported about five times.
                if (i + 1) % (len(R_Rv) // 5 + 1) == 0:
                    prnt(' Finished %s cells in %s seconds' %
                         (i, int(time() - t0)) +
                         ' - estimated %s seconds left' %
                         int((len(R_Rv) - i) * (time() - t0) / i),
                         file=self.fd)
                    self.fd.flush()
            for g in l_g_range:
                rx = rx_g[g] + R_v[0]
                ry = ry_g[g] + R_v[1]
                rz = rz_g[g] + R_v[2]

                # |r-r'-R_i|
                rr = ((r_vg[0] - rx)**2 +
                      (r_vg[1] - ry)**2 +
                      (r_vg[2] - rz)**2)**0.5

                n_av = (n_g + n_g.flatten()[g]) / 2.
                fx_g = ns * self.get_fxc_g(n_av, index=g)
                qc_g = (-4 * np.pi * ns / fx_g)**0.5
                x = qc_g * rr
                osc_x = np.sin(x) - x * np.cos(x)
                f_rr = fx_g * osc_x / (2 * np.pi**2 * rr**3)
                if nR > 1:
                    # include only exchange part of the kernel here
                    V_rr = (sici(x)[0] * 2 / np.pi - 1) / rr
                else:
                    # include the full kernel (also hartree part)
                    V_rr = (sici(x)[0] * 2 / np.pi) / rr

                # Terms with r = r'
                if (np.abs(R_v) < 0.001).all():
                    tmp_flat = f_rr.flatten()
                    tmp_flat[g] = flocal_g.flatten()[g]
                    f_rr = tmp_flat.reshape(ng_c)
                    tmp_flat = V_rr.flatten()
                    tmp_flat[g] = Vlocal_g.flatten()[g]
                    V_rr = tmp_flat.reshape(ng_c)
                    del tmp_flat

                f_rr[np.where(n_av < self.density_cut)] = 0.0
                V_rr[np.where(n_av < self.density_cut)] = 0.0

                f_rr *= weight_R[i]
                V_rr *= weight_R[i]

                # r-r'-R_i
                r_r = np.array([r_vg[0] - rx, r_vg[1] - ry, r_vg[2] - rz])

                # Fourier transform of r
                for iq, q in enumerate(self.ibzq_qc):
                    q_v = np.dot(q, icell_cv)
                    e_q = np.exp(-1j * gemmdot(q_v, r_r, beta=0.0))
                    f_q = self.pd.fft((f_rr + V_rr) * e_q, iq) * vol / ng
                    fhxc_qsGr[iq][0, :, g - l_g_range[0]] += f_q
                    if ns == 2:
                        f_q = self.pd.fft(V_rr * e_q, iq) * vol / ng
                        fhxc_qsGr[iq][1, :, g - l_g_range[0]] += f_q

    mpi.world.barrier()

    for iq, q in enumerate(self.ibzq_qc):
        npw = len(self.pd.G2_qG[iq])
        fhxc_sGsG = np.zeros((ns * npw, ns * npw), complex)
        l_pw_size = -(-npw // mpi.world.size)  # parallelize over PW below
        l_pw_range = range(mpi.world.rank * l_pw_size,
                           min((mpi.world.rank + 1) * l_pw_size, npw))

        if mpi.world.size > 1:
            # redistribute grid and plane waves in fhxc_qsGr[iq]
            bg1 = BlacsGrid(mpi.world, 1, mpi.world.size)
            bg2 = BlacsGrid(mpi.world, mpi.world.size, 1)
            # Block sizes must be integers, hence floor division (the
            # same ceiling idiom as l_g_size and l_pw_size).
            bd1 = bg1.new_descriptor(npw, ng, npw,
                                     -(-ng // mpi.world.size))
            bd2 = bg2.new_descriptor(npw, ng,
                                     -(-npw // mpi.world.size), ng)

            fhxc_Glr = np.zeros((len(l_pw_range), ng), dtype=complex)
            if ns == 2:
                Koff_Glr = np.zeros((len(l_pw_range), ng), dtype=complex)

            r = Redistributor(bg1.comm, bd1, bd2)
            r.redistribute(fhxc_qsGr[iq][0], fhxc_Glr, npw, ng)
            if ns == 2:
                r.redistribute(fhxc_qsGr[iq][1], Koff_Glr, npw, ng)
        else:
            fhxc_Glr = fhxc_qsGr[iq][0]
            if ns == 2:
                Koff_Glr = fhxc_qsGr[iq][1]

        # Fourier transform of r'
        for iG in range(len(l_pw_range)):
            f_g = fhxc_Glr[iG].reshape(ng_c)
            f_G = self.pd.fft(f_g.conj(), iq) * vol / ng
            fhxc_sGsG[l_pw_range[0] + iG, :npw] = f_G.conj()
            if ns == 2:
                v_g = Koff_Glr[iG].reshape(ng_c)
                v_G = self.pd.fft(v_g.conj(), iq) * vol / ng
                fhxc_sGsG[npw + l_pw_range[0] + iG, :npw] = v_G.conj()

        if ns == 2:  # f_00 = f_11 and f_01 = f_10
            fhxc_sGsG[:npw, npw:] = fhxc_sGsG[npw:, :npw]
            fhxc_sGsG[npw:, npw:] = fhxc_sGsG[:npw, :npw]

        mpi.world.sum(fhxc_sGsG)
        fhxc_sGsG /= vol

        if mpi.rank == 0:
            w = Writer('fhxc_%s_%s_%s_%s.gpw' %
                       (self.tag, self.xc, self.ecut, iq))
            w.dimension('sG', ns * npw)
            w.add('fhxc_sGsG', ('sG', 'sG'), dtype=complex)
            if nR > 1:  # add Hartree kernel evaluated in PW basis
                # NOTE(review): Gq2_G aliases self.pd.G2_qG[iq], so the
                # assignment below modifies that array in place.
                Gq2_G = self.pd.G2_qG[iq]
                if (q == 0).all():
                    Gq2_G[0] = 1.
                vq_G = 4 * np.pi / Gq2_G
                fhxc_sGsG += np.tile(np.eye(npw) * vq_G, (ns, ns))
            w.fill(fhxc_sGsG)
            w.close()
        mpi.world.barrier()
    prnt(file=self.fd)
def main(N=72, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Check the ScaLAPACK divide-and-conquer drivers against serial LAPACK.

    Random test matrices ``H0`` and ``S0`` are built on the master rank,
    solved there with the serial routines, redistributed onto an
    ``mprocs`` x ``nprocs`` BLACS grid (8 x 8 blocks) and solved again
    with the ScaLAPACK wrappers.  The largest absolute deviations are
    printed on the master and asserted to be below the module-level
    ``tol`` on every rank.

    Parameters:

    N: int
        Dimension of the square test matrices.
    seed: int
        Seed passed to ``np.random.RandomState``.
    mprocs, nprocs: int
        Shape of the BLACS process grid.
    dtype: type
        ``float`` or ``complex``.  The distributed inverse is only run
        and asserted for ``complex``.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if dtype == complex:
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master (one N x N block):
    glob = grid.new_descriptor(N, N, N, N)

    # Populate matrices local to master:
    H0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    S0 = glob.zeros(dtype=dtype) + gen.rand(*glob.shape)
    C0 = glob.empty(dtype=dtype)
    if rank == 0:
        # Complex case must have real numbers on the diagonal.
        # We make a simple complex Hermitian matrix below.
        H0 = H0 + epsilon * (0.1 * np.tri(N, N, k=-N // nprocs) +
                             0.3 * np.tri(N, N, k=-1))
        S0 = S0 + epsilon * (0.2 * np.tri(N, N, k=-N // nprocs) +
                             0.4 * np.tri(N, N, k=-1))
        # Make matrices symmetric
        rk(1.0, H0.copy(), 0.0, H0)
        rk(1.0, S0.copy(), 0.0, S0)
        # Overlap matrix must be semi-positive definite
        S0 = S0 + 50.0 * np.eye(N, N, 0)
        # Hamiltonian is usually diagonally dominant
        H0 = H0 + 75.0 * np.eye(N, N, 0)
        C0 = S0.copy()
        S0_inv = S0.copy()

    # Serial reference eigenvalues (standard and generalized problem)
    W0 = np.empty(N, dtype=float)
    W0_g = np.empty(N, dtype=float)

    # Calculate eigenvalues / other serial results
    if rank == 0:
        diagonalize(H0.copy(), W0)
        general_diagonalize(H0.copy(), W0_g, S0.copy())
        inverse_cholesky(C0)  # result returned in lower triangle
        tri2full(S0_inv, 'L')
        S0_inv = inv(S0_inv)

    assert glob.check(H0) and glob.check(S0) and glob.check(C0)

    # Create distributed descriptor with 8 x 8 blocks:
    dist = grid.new_descriptor(N, N, 8, 8)

    # Distributed matrices:
    # We can use empty here, but end up with garbage on
    # the other half of the triangle when we redistribute.
    # This is fine because ScaLAPACK does not care.
    H = dist.empty(dtype=dtype)
    S = dist.empty(dtype=dtype)
    Z = dist.empty(dtype=dtype)
    C = dist.empty(dtype=dtype)
    Sinv = dist.empty(dtype=dtype)

    # Eigenvalues are non-BLACS matrices
    W_dc = np.empty(N, dtype=float)
    W_g_dc = np.empty(N, dtype=float)

    Glob2dist = Redistributor(world, glob, dist)
    Glob2dist.redistribute(H0, H, uplo='L')
    Glob2dist.redistribute(S0, S, uplo='L')
    Glob2dist.redistribute(S0, C, uplo='L')  # C0 was previously overwritten
    Glob2dist.redistribute(S0, Sinv, uplo='L')

    # NOTE: only the divide-and-conquer ("dc") drivers are exercised.
    # The expert ("ex") and "mr3" drivers are no longer tested since
    # there might be a buffer overflow error.
    scalapack_diagonalize_dc(dist, H.copy(), Z, W_dc, 'L')
    scalapack_general_diagonalize_dc(dist, H.copy(), S.copy(), Z, W_g_dc, 'L')

    scalapack_inverse_cholesky(dist, C, 'L')

    if dtype == complex:  # Only supported for complex for now
        scalapack_inverse(dist, Sinv, 'L')

    # Undo redistribute
    C_test = glob.empty(dtype=dtype)
    Sinv_test = glob.empty(dtype=dtype)
    Dist2glob = Redistributor(world, dist, glob)
    Dist2glob.redistribute(C, C_test)
    Dist2glob.redistribute(Sinv, Sinv_test)

    if rank == 0:
        diag_dc_err = abs(W_dc - W0).max()
        general_diag_dc_err = abs(W_g_dc - W0_g).max()
        inverse_chol_err = abs(C_test - C0).max()

        tri2full(Sinv_test, 'L')
        inverse_err = abs(Sinv_test - S0_inv).max()

        print('diagonalize dc err', diag_dc_err)
        print('general diagonalize dc err', general_diag_dc_err)
        print('inverse chol err', inverse_chol_err)
        if dtype == complex:
            print('inverse err', inverse_err)
    else:
        diag_dc_err = 0.0
        general_diag_dc_err = 0.0
        inverse_chol_err = 0.0
        inverse_err = 0.0

    # We don't like exceptions on only one cpu, so share the master's
    # errors with everybody before asserting.
    diag_dc_err = world.sum(diag_dc_err)
    general_diag_dc_err = world.sum(general_diag_dc_err)
    inverse_chol_err = world.sum(inverse_chol_err)
    inverse_err = world.sum(inverse_err)

    assert diag_dc_err < tol
    assert general_diag_dc_err < tol
    assert inverse_chol_err < tol
    if dtype == complex:
        assert inverse_err < tol
def diagonalize_full_hamiltonian(self, ham, atoms, occupations, txt,
                                 nbands=None, scalapack=None, expert=False):
    """Diagonalize the full Hamiltonian for every k-point in ``kpt_u``.

    For each k-point the matrices returned by ``self.hs`` are solved
    with ``general_diagonalize_dc``; ``kpt.eps_n``, ``kpt.psit_nG`` and
    the projections ``kpt.P_ani`` are replaced by the result and
    ``kpt.f_n`` is reset to None.  Finally ``occupations.calculate(self)``
    is called.

    Parameters:

    ham:
        Hamiltonian object handed on to ``self.hs``.
    atoms:
        Provides ``get_scaled_positions()`` for the projectors.
    occupations:
        Object whose ``calculate(self)`` is called at the end.
    txt:
        File-like object that receives the log lines and progress bar.
    nbands: int or None
        Number of bands to keep.  None selects the largest multiple of
        the band-communicator size not exceeding ``self.pd.ngmin``.
    scalapack: list, tuple or other
        ``(nprow, npcol, blocksize)`` for the ScaLAPACK grid.  Anything
        else selects a near-square grid with block size 64.  Only used
        when the band communicator has more than one rank.
    expert: bool
        If true, ``iu=nbands`` is passed to the serial solver.
    """
    assert self.dtype == complex

    if nbands is not None:
        assert nbands <= self.pd.ngmin
    else:
        nbands = self.pd.ngmin // self.bd.comm.size * self.bd.comm.size

    iu = nbands if expert else None

    self.bd = bd = BandDescriptor(nbands, self.bd.comm)

    log = functools.partial(print, file=txt)
    log('Diagonalizing full Hamiltonian ({0} lowest bands)'.format(nbands))
    log('Matrix size (min, max): {0}, {1}'.format(self.pd.ngmin,
                                                  self.pd.ngmax))
    mem = 3 * self.pd.ngmax**2 * 16 / bd.comm.size / 1024**2
    log('Approximate memory usage per core: {0:.3f} MB'.format(mem))

    if bd.comm.size > 1:
        if isinstance(scalapack, (list, tuple)):
            nprow, npcol, b = scalapack
        else:
            # Largest divisor of the communicator size that does not
            # exceed its (rounded) square root.
            nprow = int(round(bd.comm.size**0.5))
            while bd.comm.size % nprow != 0:
                nprow -= 1
            npcol = bd.comm.size // nprow
            b = 64
        log('ScaLapack grid: {0}x{1},'.format(nprow, npcol),
            'block-size:', b)
        # bg: one block of rows per rank; bg2: 2D grid used by the solver.
        bg = BlacsGrid(bd.comm, bd.comm.size, 1)
        bg2 = BlacsGrid(bd.comm, nprow, npcol)
        scalapack = True
    else:
        nprow = npcol = 1
        scalapack = False

    self.pt.set_positions(atoms.get_scaled_positions())
    self.kpt_u[0].P_ani = None
    self.allocate_arrays_for_projections(self.pt.my_atom_indices)

    myslice = bd.get_slice()

    pb = ProgressBar(txt)
    nkpt = len(self.kpt_u)

    for u, kpt in enumerate(self.kpt_u):
        pb.update(u / nkpt)
        npw = len(self.pd.Q_qG[kpt.q])

        if not scalapack:
            md = md2 = MatrixDescriptor(npw, npw)
        else:
            mynpw = -(-npw // bd.comm.size)  # ceiling division
            md = BlacsDescriptor(bg, npw, npw, mynpw, npw)
            md2 = BlacsDescriptor(bg2, npw, npw, b, b)

        with self.timer('Build H and S'):
            H_GG, S_GG = self.hs(ham, kpt.q, kpt.s, md)

        if scalapack:
            to_blocks = Redistributor(bd.comm, md, md2)
            H_GG = to_blocks.redistribute(H_GG)
            S_GG = to_blocks.redistribute(S_GG)

        psit_nG = md2.empty(dtype=complex)
        eps_n = np.empty(npw)

        with self.timer('Diagonalize'):
            if scalapack:
                md2.general_diagonalize_dc(H_GG, S_GG, psit_nG, eps_n)
            else:
                md2.general_diagonalize_dc(H_GG, S_GG, psit_nG, eps_n,
                                           iu=iu)
        del H_GG, S_GG

        kpt.eps_n = eps_n[myslice].copy()

        if scalapack:
            # Back to a row layout with bd.mynbands rows per rank.
            md3 = BlacsDescriptor(bg, npw, npw, bd.mynbands, npw)
            to_rows = Redistributor(bd.comm, md2, md3)
            psit_nG = to_rows.redistribute(psit_nG)

        kpt.psit_nG = psit_nG[:bd.mynbands].copy()
        del psit_nG

        with self.timer('Projections'):
            self.pt.integrate(kpt.psit_nG, kpt.P_ani, kpt.q)

        kpt.f_n = None

    pb.finish()

    occupations.calculate(self)
def calculate_forces(self, hamiltonian, F_av):
    """Add the LCAO force contributions to ``F_av`` in place.

    The kinetic, potential, overlap (Theta), PAW-correction (rho) and
    atomic-Hamiltonian terms are accumulated in separate arrays shaped
    like ``F_av``, added to ``F_av``, and ``F_av`` is then summed over
    ``ksl.orbital_comm`` and, on ranks with ``bd.comm.rank == 0``, via
    ``kd.comm.sum(F_av, 0)``.

    There are two code paths.  Without BLACS the full derivative arrays
    from ``tci.calculate_derivative`` are contracted with the transposed
    density matrices.  With BLACS (``ksl`` is a ``BlacsOrbitalLayouts``)
    the derivatives are evaluated atom pair by atom pair and contracted
    with the block of the distributed density matrix held by this rank.

    Parameters:

    hamiltonian:
        Object providing ``vt_sG`` and ``dH_asp``.
    F_av: ndarray
        Force array indexed by atom and direction; modified in place.
    """
    self.timer.start('LCAO forces')

    spos_ac = self.tci.atoms.get_scaled_positions() % 1.0
    ksl = self.ksl
    nao = ksl.nao
    mynao = ksl.mynao
    nq = len(self.kd.ibzk_qc)
    dtype = self.dtype
    tci = self.tci
    gd = self.gd
    bfs = self.basis_functions

    Mstart = ksl.Mstart
    Mstop = ksl.Mstop

    from gpaw.kohnsham_layouts import BlacsOrbitalLayouts
    isblacs = isinstance(ksl, BlacsOrbitalLayouts)  # XXX

    if not isblacs:
        self.timer.start('TCI derivative')
        dThetadR_qvMM = np.empty((nq, 3, mynao, nao), dtype)
        dTdR_qvMM = np.empty((nq, 3, mynao, nao), dtype)
        dPdR_aqvMi = {}
        for a in self.basis_functions.my_atom_indices:
            ni = self.setups[a].ni
            dPdR_aqvMi[a] = np.empty((nq, 3, nao, ni), dtype)
        tci.calculate_derivative(spos_ac, dThetadR_qvMM, dTdR_qvMM,
                                 dPdR_aqvMi)
        gd.comm.sum(dThetadR_qvMM)
        gd.comm.sum(dTdR_qvMM)
        self.timer.stop('TCI derivative')

        my_atom_indices = bfs.my_atom_indices
        atom_indices = bfs.atom_indices

        def _slices(indices):
            # Yield (atom, first row, last row) of each atom's basis
            # functions relative to this rank's first row Mstart.
            for a in indices:
                M1 = bfs.M_a[a] - Mstart
                M2 = M1 + self.setups[a].nao
                if M2 > 0:
                    yield a, max(0, M1), M2

        def slices():
            return _slices(atom_indices)

        def my_slices():
            return _slices(my_atom_indices)

    # Energy-weighted density matrix:
    #
    #   E_{mu nu} = sum_{x z} S^-1_{mu x} H_{x z} rho_{z nu}
    #             = sum_n c*_{n mu} eps_n f_n c_{n nu}
    #
    # We use the transpose of that matrix.  The first form is used
    # if rho is given, otherwise the coefficients are used.
    self.timer.start('Initial')

    rhoT_uMM = []
    ET_uMM = []

    if not isblacs:
        if self.kpt_u[0].rho_MM is None:
            self.timer.start('Get density matrix')
            for kpt in self.kpt_u:
                rhoT_MM = ksl.get_transposed_density_matrix(kpt.f_n,
                                                            kpt.C_nM)
                rhoT_uMM.append(rhoT_MM)
                ET_MM = ksl.get_transposed_density_matrix(
                    kpt.f_n * kpt.eps_n, kpt.C_nM)
                ET_uMM.append(ET_MM)

                if hasattr(kpt, 'c_on'):
                    # XXX does this work with BLACS/non-BLACS/etc.?
                    assert self.bd.comm.size == 1
                    d_nn = np.zeros((self.bd.mynbands, self.bd.mynbands),
                                    dtype=kpt.C_nM.dtype)
                    for ne, c_n in zip(kpt.ne_o, kpt.c_on):
                        d_nn += ne * np.outer(c_n.conj(), c_n)
                    # In-place updates: the arrays were already appended
                    # to rhoT_uMM / ET_uMM above.
                    rhoT_MM += ksl.get_transposed_density_matrix_delta(
                        d_nn, kpt.C_nM)
                    ET_MM += ksl.get_transposed_density_matrix_delta(
                        d_nn * kpt.eps_n, kpt.C_nM)
            self.timer.stop('Get density matrix')
        else:
            rhoT_uMM = []
            ET_uMM = []
            for kpt in self.kpt_u:
                H_MM = self.eigensolver.calculate_hamiltonian_matrix(
                    hamiltonian, self, kpt)
                tri2full(H_MM)
                S_MM = kpt.S_MM.copy()
                tri2full(S_MM)
                ET_MM = np.linalg.solve(S_MM, gemmdot(H_MM,
                                                      kpt.rho_MM)).T.copy()
                del S_MM, H_MM
                rhoT_MM = kpt.rho_MM.T.copy()
                rhoT_uMM.append(rhoT_MM)
                ET_uMM.append(ET_MM)
    self.timer.stop('Initial')

    if isblacs:  # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
        from gpaw.blacs import BlacsGrid, Redistributor

        def get_density_matrix(f_n, C_nM, redistributor):
            rho1_mm = ksl.calculate_blocked_density_matrix(f_n,
                                                           C_nM).conj()
            rho_mm = redistributor.redistribute(rho1_mm)
            return rho_mm

        pcutoff_a = [max([pt.get_cutoff() for pt in setup.pt_j])
                     for setup in self.setups]
        phicutoff_a = [max([phit.get_cutoff() for phit in setup.phit_j])
                       for setup in self.setups]

        # XXX should probably use bdsize x gdsize instead
        # That would be consistent with some existing grids
        grid = BlacsGrid(ksl.block_comm, self.gd.comm.size,
                         self.bd.comm.size)

        # Ceiling division: a single block per process row / column.
        blocksize1 = -(-nao // grid.nprow)
        blocksize2 = -(-nao // grid.npcol)
        # XXX what are rows and columns actually?
        desc = grid.new_descriptor(nao, nao, blocksize1, blocksize2)

        rhoT_umm = []
        ET_umm = []
        redistributor = Redistributor(grid.comm, ksl.mmdescriptor, desc)
        Fpot_av = np.zeros_like(F_av)
        for u, kpt in enumerate(self.kpt_u):
            self.timer.start('Get density matrix')
            rhoT_mm = get_density_matrix(kpt.f_n, kpt.C_nM, redistributor)
            rhoT_umm.append(rhoT_mm)
            self.timer.stop('Get density matrix')

            self.timer.start('Potential')
            rhoT_mM = ksl.distribute_to_columns(rhoT_mm, desc)

            vt_G = hamiltonian.vt_sG[kpt.s]
            Fpot_av += bfs.calculate_force_contribution(vt_G, rhoT_mM,
                                                        kpt.q)
            del rhoT_mM
            self.timer.stop('Potential')

        self.timer.start('Get density matrix')
        for kpt in self.kpt_u:
            ET_mm = get_density_matrix(kpt.f_n * kpt.eps_n, kpt.C_nM,
                                       redistributor)
            ET_umm.append(ET_mm)
        self.timer.stop('Get density matrix')

        # Global index range [M1start, M1stop) x [M2start, M2stop) of the
        # block held by this rank, and its local extent.
        M1start = blocksize1 * grid.myrow
        M2start = blocksize2 * grid.mycol

        M1stop = min(M1start + blocksize1, nao)
        M2stop = min(M2start + blocksize2, nao)

        m1max = M1stop - M1start
        m2max = M2stop - M2start

    if not isblacs:
        # Kinetic energy contribution
        #
        #   F^a += 2 Re sum_{mu in a; nu}
        #              (dT_{mu nu} / dR_{mu nu}) rho_{nu mu}
        #
        Fkin_av = np.zeros_like(F_av)
        for u, kpt in enumerate(self.kpt_u):
            dEdTrhoT_vMM = (dTdR_qvMM[kpt.q] *
                            rhoT_uMM[u][np.newaxis]).real
            for a, M1, M2 in my_slices():
                Fkin_av[a, :] += \
                    2.0 * dEdTrhoT_vMM[:, M1:M2].sum(-1).sum(-1)
        del dEdTrhoT_vMM

        # Density matrix contribution due to basis overlap
        #
        #   F^a += -2 Re sum_{mu in a; nu}
        #               (dTheta_{mu nu} / dR_{mu nu}) E_{nu mu}
        #
        Ftheta_av = np.zeros_like(F_av)
        for u, kpt in enumerate(self.kpt_u):
            dThetadRE_vMM = (dThetadR_qvMM[kpt.q] *
                             ET_uMM[u][np.newaxis]).real
            for a, M1, M2 in my_slices():
                Ftheta_av[a, :] += \
                    -2.0 * dThetadRE_vMM[:, M1:M2].sum(-1).sum(-1)
        del dThetadRE_vMM

    if isblacs:
        from gpaw.lcao.overlap import TwoCenterIntegralCalculator
        self.timer.start('Prepare TCI loop')
        M_a = bfs.M_a

        Fkin2_av = np.zeros_like(F_av)
        Ftheta2_av = np.zeros_like(F_av)

        cell_cv = tci.atoms.cell
        spos_ac = tci.atoms.get_scaled_positions() % 1.0

        overlapcalc = TwoCenterIntegralCalculator(self.kd.ibzk_qc,
                                                  derivative=False)

        # XXX this is not parallel *AT ALL*.
        self.timer.start('Get neighbors')
        nl = tci.atompairs.pairs.neighbors
        r_and_offset_aao = get_r_and_offsets(nl, spos_ac, cell_cv)
        # sorted() instead of keys() + list.sort(): dict views have no
        # sort() method on Python 3.
        atompairs = sorted(r_and_offset_aao.keys())
        self.timer.stop('Get neighbors')

        T_expansions = tci.T_expansions
        Theta_expansions = tci.Theta_expansions
        P_expansions = tci.P_expansions
        nq = len(self.kd.ibzk_qc)

        dH_asp = hamiltonian.dH_asp

        self.timer.start('broadcast dH')
        alldH_asp = {}
        for a in range(len(self.setups)):
            gdrank = bfs.sphere_a[a].rank
            if gdrank == gd.rank:
                dH_sp = dH_asp[a]
            else:
                ni = self.setups[a].ni
                dH_sp = np.empty((self.nspins, ni * (ni + 1) // 2))
            gd.comm.broadcast(dH_sp, gdrank)
            # okay, now everyone gets copies of dH_sp
            alldH_asp[a] = dH_sp
        self.timer.stop('broadcast dH')

        # This will get sort of hairy.  We need to account for some
        # three-center overlaps, such as:
        #
        #   < dPhi^a1/dR | p~^a3 > dH^a3 < p~^a3 | Phi^a2 > rho^{a2,a1}
        #
        # To this end we will loop over all pairs of atoms (a1, a3),
        # and then a sub-loop over (a3, a2).
        from gpaw.lcao.overlap import DerivativeAtomicDisplacement

        class Displacement(DerivativeAtomicDisplacement):
            def __init__(self, a1, a2, R_c, offset):
                phases = overlapcalc.phaseclass(overlapcalc.ibzk_qc,
                                                offset)
                DerivativeAtomicDisplacement.__init__(
                    self, None, a1, a2, R_c, offset, phases)

        # Cache of Displacement objects with evaluated spherical
        # harmonics.
        disp_aao = {}

        def get_displacements(a1, a2, maxdistance):
            # XXX the way maxdistance is handled it can lead to
            # bad caching when different maxdistances are passed
            # to subsequent calls with same pair of atoms
            disp_o = disp_aao.get((a1, a2))
            if disp_o is None:
                disp_o = []
                for R_c, offset in r_and_offset_aao[(a1, a2)]:
                    if np.linalg.norm(R_c) > maxdistance:
                        continue
                    disp = Displacement(a1, a2, R_c, offset)
                    disp_o.append(disp)
                disp_aao[(a1, a2)] = disp_o
            return [disp for disp in disp_o if disp.r < maxdistance]

        self.timer.stop('Prepare TCI loop')
        self.timer.start('Not so complicated loop')

        for (a1, a2) in atompairs:
            if a1 >= a2:
                # Actually this leads to bad load balance.
                # We should take a1 > a2 or a1 < a2 equally many times.
                # Maybe decide which of these choices
                # depending on whether a2 % 1 == 0
                continue

            m1start = M_a[a1] - M1start
            m2start = M_a[a2] - M2start
            if m1start >= blocksize1 or m2start >= blocksize2:
                continue  # (we have only one block per CPU)

            T_expansion = T_expansions.get(a1, a2)
            Theta_expansion = Theta_expansions.get(a1, a2)
            nm1, nm2 = T_expansion.shape

            m1stop = min(m1start + nm1, m1max)
            m2stop = min(m2start + nm2, m2max)

            if m1stop <= 0 or m2stop <= 0:
                continue

            # m*: index range inside the local block;
            # J*: matching index range inside the atom-pair expansion.
            m1start = max(m1start, 0)
            m2start = max(m2start, 0)
            J1start = max(0, M1start - M_a[a1])
            J2start = max(0, M2start - M_a[a2])
            J1stop = J1start + m1stop - m1start
            J2stop = J2start + m2stop - m2start

            dTdR_qvmm = T_expansion.zeros((nq, 3), dtype=dtype)
            dThetadR_qvmm = Theta_expansion.zeros((nq, 3), dtype=dtype)

            disp_o = get_displacements(a1, a2,
                                       phicutoff_a[a1] + phicutoff_a[a2])
            for disp in disp_o:
                disp.evaluate_overlap(T_expansion, dTdR_qvmm)
                disp.evaluate_overlap(Theta_expansion, dThetadR_qvmm)

            for u, kpt in enumerate(self.kpt_u):
                rhoT_mm = rhoT_umm[u][m1start:m1stop, m2start:m2stop]
                ET_mm = ET_umm[u][m1start:m1stop, m2start:m2stop]
                Fkin_v = 2.0 * (
                    dTdR_qvmm[kpt.q][:, J1start:J1stop, J2start:J2stop] *
                    rhoT_mm[np.newaxis]).real.sum(-1).sum(-1)
                Ftheta_v = 2.0 * (
                    dThetadR_qvmm[kpt.q][:, J1start:J1stop,
                                         J2start:J2stop] *
                    ET_mm[np.newaxis]).real.sum(-1).sum(-1)
                Fkin2_av[a1] += Fkin_v
                Fkin2_av[a2] -= Fkin_v
                Ftheta2_av[a1] -= Ftheta_v
                Ftheta2_av[a2] += Ftheta_v

        Fkin_av = Fkin2_av
        Ftheta_av = Ftheta2_av
        self.timer.stop('Not so complicated loop')

        # Cache of (dHP_uim, dSP_uim) per atom pair (a2, a3).
        dHP_and_dSP_aauim = {}

        # For every second atom a3 of a pair, the list of first atoms a2.
        a2values = {}
        for (a2, a3) in atompairs:
            if a3 not in a2values:
                a2values[a3] = []
            a2values[a3].append(a2)

        Fatom_av = np.zeros_like(F_av)
        Frho_av = np.zeros_like(F_av)
        self.timer.start('Complicated loop')
        for a1, a3 in atompairs:
            if a1 == a3:
                # Functions reside on same atom, so their overlap
                # does not change when atom is displaced
                continue
            m1start = M_a[a1] - M1start
            if m1start >= blocksize1:
                continue

            P_expansion = P_expansions.get(a1, a3)
            nm1 = P_expansion.shape[0]
            m1stop = min(m1start + nm1, m1max)
            if m1stop <= 0:
                continue

            m1start = max(m1start, 0)
            J1start = max(0, M1start - M_a[a1])
            J1stop = J1start + m1stop - m1start

            disp_o = get_displacements(a1, a3,
                                       phicutoff_a[a1] + pcutoff_a[a3])
            if len(disp_o) == 0:
                continue

            dPdR_qvmi = P_expansion.zeros((nq, 3), dtype=dtype)
            for disp in disp_o:
                disp.evaluate_overlap(P_expansion, dPdR_qvmi)

            dPdR_qvmi = dPdR_qvmi[:, :, J1start:J1stop, :].copy()
            for a2 in a2values[a3]:
                m2start = M_a[a2] - M2start
                if m2start >= blocksize2:
                    continue

                P_expansion2 = P_expansions.get(a2, a3)
                nm2 = P_expansion2.shape[0]
                m2stop = min(m2start + nm2, m2max)
                if m2stop <= 0:
                    continue

                disp_o = get_displacements(a2, a3,
                                           phicutoff_a[a2] + pcutoff_a[a3])
                if len(disp_o) == 0:
                    continue

                m2start = max(m2start, 0)
                J2start = max(0, M2start - M_a[a2])
                J2stop = J2start + m2stop - m2start

                if (a2, a3) in dHP_and_dSP_aauim:
                    dHP_uim, dSP_uim = dHP_and_dSP_aauim[(a2, a3)]
                else:
                    P_qmi = P_expansion2.zeros((nq, ), dtype=dtype)
                    for disp in disp_o:
                        disp.evaluate_direct(P_expansion2, P_qmi)
                    P_qmi = P_qmi[:, J2start:J2stop].copy()
                    dH_sp = alldH_asp[a3]
                    dS_ii = self.setups[a3].dO_ii

                    dHP_uim = []
                    dSP_uim = []
                    for u, kpt in enumerate(self.kpt_u):
                        dH_ii = unpack(dH_sp[kpt.s])
                        dHP_im = np.dot(P_qmi[kpt.q], dH_ii).T.conj()
                        # XXX only need nq of these
                        dSP_im = np.dot(P_qmi[kpt.q], dS_ii).T.conj()
                        dHP_uim.append(dHP_im)
                        dSP_uim.append(dSP_im)
                    dHP_and_dSP_aauim[(a2, a3)] = dHP_uim, dSP_uim

                for u, kpt in enumerate(self.kpt_u):
                    rhoT_mm = rhoT_umm[u][m1start:m1stop, m2start:m2stop]
                    ET_mm = ET_umm[u][m1start:m1stop, m2start:m2stop]
                    dPdRdHP_vmm = np.dot(dPdR_qvmi[kpt.q], dHP_uim[u])
                    dPdRdSP_vmm = np.dot(dPdR_qvmi[kpt.q], dSP_uim[u])

                    Fatom_c = 2.0 * (dPdRdHP_vmm *
                                     rhoT_mm).real.sum(-1).sum(-1)
                    Frho_c = 2.0 * (dPdRdSP_vmm *
                                    ET_mm).real.sum(-1).sum(-1)
                    Fatom_av[a1] += Fatom_c
                    Fatom_av[a3] -= Fatom_c

                    Frho_av[a1] -= Frho_c
                    Frho_av[a3] += Frho_c

        self.timer.stop('Complicated loop')

    if not isblacs:
        # Potential contribution
        #
        #   F^a += -2 Re sum_{mu in a; nu}
        #       int (dPhi_mu(r) / dR_a) v~(r) Phi_nu(r) dr  rho_{nu mu}
        #
        self.timer.start('Potential')
        Fpot_av = np.zeros_like(F_av)
        for u, kpt in enumerate(self.kpt_u):
            vt_G = hamiltonian.vt_sG[kpt.s]
            Fpot_av += bfs.calculate_force_contribution(vt_G, rhoT_uMM[u],
                                                        kpt.q)
        self.timer.stop('Potential')

        # Density matrix contribution from PAW correction
        #
        #   F^a += 2 Re sum_{mu nu} Z^a_{mu nu} E_{nu mu}
        #        - 2 Re sum_{b; mu in a; nu} Z^b_{mu nu} E_{nu mu}
        #
        # with
        #
        #   Z^b_{mu nu} = sum_{ij} (dP^b*_{i mu} / dR_{b mu})
        #                          dS^b_{ij} P^b_{j nu}
        #
        self.timer.start('Paw correction')
        Frho_av = np.zeros_like(F_av)
        for u, kpt in enumerate(self.kpt_u):
            work_MM = np.zeros((mynao, nao), dtype)
            ZE_MM = None
            for b in my_atom_indices:
                setup = self.setups[b]
                dO_ii = np.asarray(setup.dO_ii, dtype)
                dOP_iM = np.zeros((setup.ni, nao), dtype)
                gemm(1.0, self.P_aqMi[b][kpt.q], dO_ii, 0.0, dOP_iM, 'c')
                for v in range(3):
                    gemm(1.0, dOP_iM,
                         dPdR_aqvMi[b][kpt.q][v][Mstart:Mstop],
                         0.0, work_MM, 'n')
                    ZE_MM = (work_MM * ET_uMM[u]).real
                    for a, M1, M2 in slices():
                        dE = 2 * ZE_MM[M1:M2].sum()
                        Frho_av[a, v] -= dE  # the "b; mu in a; nu" term
                        Frho_av[b, v] += dE  # the "mu nu" term
        del work_MM, ZE_MM
        self.timer.stop('Paw correction')

        # Atomic density contribution
        #
        #   F^a += -2 Re sum_{mu nu} A^a_{mu nu} rho_{nu mu}
        #        + 2 Re sum_{b; mu in a; nu} A^b_{mu nu} rho_{nu mu}
        #
        # with
        #
        #   A^b_{mu nu} = sum_{ij} (dP^b*_{i mu} / dR_{b mu})
        #                          dH^b_{ij} P^b_{j nu}
        #
        self.timer.start('Atomic Hamiltonian force')
        Fatom_av = np.zeros_like(F_av)
        for u, kpt in enumerate(self.kpt_u):
            for b in my_atom_indices:
                H_ii = np.asarray(unpack(hamiltonian.dH_asp[b][kpt.s]),
                                  dtype)
                HP_iM = gemmdot(
                    H_ii,
                    np.ascontiguousarray(self.P_aqMi[b][kpt.q].T.conj()))
                for v in range(3):
                    dPdR_Mi = dPdR_aqvMi[b][kpt.q][v][Mstart:Mstop]
                    ArhoT_MM = (gemmdot(dPdR_Mi, HP_iM) *
                                rhoT_uMM[u]).real
                    for a, M1, M2 in slices():
                        dE = 2 * ArhoT_MM[M1:M2].sum()
                        Fatom_av[a, v] += dE  # the "b; mu in a; nu" term
                        Fatom_av[b, v] -= dE  # the "mu nu" term
        self.timer.stop('Atomic Hamiltonian force')

    F_av += Fkin_av + Fpot_av + Ftheta_av + Frho_av + Fatom_av

    self.timer.start('Wait for sum')
    ksl.orbital_comm.sum(F_av)
    if self.bd.comm.rank == 0:
        self.kd.comm.sum(F_av, 0)
    self.timer.stop('Wait for sum')
    self.timer.stop('LCAO forces')
def __init__(self, sl_lrtddft, nrows, lr_comms):
    """Create the BLACS grids, descriptors and redistributors.

    Parameters:

    sl_lrtddft: iterable
        Unpacked as ``(mprocs, nprocs, block_size)``.
    nrows: int
        The matrix set up here has ``4 * nrows`` rows and columns; the
        vector has ``4 * nrows`` elements.
    lr_comms:
        Object providing ``parent_comm``, ``dd_comm`` and ``eh_comm``.
    """
    self.mprocs, self.nprocs, self.block_size = tuple(sl_lrtddft)
    self.lr_comms = lr_comms

    parent_comm = self.lr_comms.parent_comm
    dd_size = self.lr_comms.dd_comm.size
    eh_size = self.lr_comms.eh_comm.size
    dim = nrows * 4

    # NOTE: for SCALAPACK we need the TRANSPOSED matrix (and vector).

    # -----------------------------------------------------------------
    # Matrix
    #
    # Grid on which the matrix is stored originally ...
    self.orig_matrix_grid = BlacsGrid(parent_comm, dd_size, eh_size)
    # ... and grid on which the problem is solved.
    self.solve_matrix_grid = BlacsGrid(parent_comm,
                                       self.mprocs, self.nprocs)

    # Stored layout: dim x dim in 4 x 4 blocks.
    self.orig_matrix_descr = self.orig_matrix_grid.new_descriptor(
        dim, dim, 4, 4)

    # Solver layout: dim x dim in block_size x block_size blocks.
    self.solve_matrix_descr = self.solve_matrix_grid.new_descriptor(
        dim, dim, self.block_size, self.block_size)

    self.matrix_in_redist = Redistributor(parent_comm,
                                          self.orig_matrix_descr,
                                          self.solve_matrix_descr)

    # -----------------------------------------------------------------
    # Vector
    #
    # Stored on a single process row spanning all dd x eh ranks.
    self.orig_vector_grid = BlacsGrid(parent_comm, 1, dd_size * eh_size)

    # The solver side reuses solve_matrix_grid; there is no separate
    # solve grid for the vector.
    nrhs = 1
    self.orig_vector_descr = self.orig_vector_grid.new_descriptor(
        nrhs, dim, 1, 4)
    self.solve_vector_descr = self.solve_matrix_grid.new_descriptor(
        nrhs, dim, 1, self.block_size)

    self.vector_in_redist = Redistributor(parent_comm,
                                          self.orig_vector_descr,
                                          self.solve_vector_descr)
    self.vector_out_redist = Redistributor(parent_comm,
                                           self.solve_vector_descr,
                                           self.orig_vector_descr)
def __init__(self, sl_lrtddft, nkq, dd_comm, eh_comm):
    """Create grids, descriptors and redistributors for nkq-sized problems.

    Parameters:

    sl_lrtddft: iterable
        Unpacked as ``(mcpus, ncpus, blocksize)``: the shape of the
        diagonalization grid and its block size.
    nkq: int
        Dimension of the ``nkq`` x ``nkq`` matrix; the second set of
        descriptors uses ``4 * nkq``.
    dd_comm:
        Domain communicator; also used as the world communicator when
        ``eh_comm.parent`` is None.
    eh_comm:
        Communicator whose ``parent`` is taken as the world communicator.
    """
    mcpus, ncpus, blocksize = tuple(sl_lrtddft)

    self.dd_comm = dd_comm
    world = eh_comm.parent
    if world is None:
        world = dd_comm
    self.world = world

    # All the ranks within domain communicator contain the omega matrix
    # construct new communicator only on domain masters
    eh_ranks = np.arange(eh_comm.size) * dd_comm.size
    self.eh_comm2 = self.world.new_communicator(eh_ranks)

    # Stored layout: eh_comm.size x 1 grid with 1 x nkq blocks.
    self.eh_grid = BlacsGrid(self.eh_comm2, eh_comm.size, 1)
    self.eh_descr = self.eh_grid.new_descriptor(nkq, nkq, 1, nkq)

    # Diagonalization layout: mcpus x ncpus grid with square blocks.
    self.diag_grid = BlacsGrid(self.world, mcpus, ncpus)
    self.diag_descr = self.diag_grid.new_descriptor(nkq, nkq,
                                                    blocksize, blocksize)

    self.redistributor_in = Redistributor(self.world,
                                          self.eh_descr,
                                          self.diag_descr)
    self.redistributor_out = Redistributor(self.world,
                                           self.diag_descr,
                                           self.eh_descr)

    """
    # -----------------------------------------------------------------
    # for SCALAPACK we need TRANSPOSED MATRIX (and vector)
    # -----------------------------------------------------------------
    # M = rows, N = cols
    M = nkq*4; N = nkq*4; mb = nkq*4; nb = 4; Nrhs = 1
    # Matrix, mp=1, np=eh_comm.size
    self.eh_grid2a = BlacsGrid(self.eh_comm2, eh_comm.size, 1)
    # Vector, mp=eh_comm.size, np=1
    self.eh_grid2b = BlacsGrid(self.eh_comm2, 1, eh_comm.size)
    self.eh_descr2a = self.eh_grid2a.new_descriptor(N, M, nb, mb)
    self.eh_descr2b = self.eh_grid2b.new_descriptor(Nrhs, N, 1, nb)
    self.solve_descr2a =self.diag_grid.new_descriptor(N, M, blocksize, blocksize)
    self.solve_descr2b =self.diag_grid.new_descriptor(Nrhs, N, 1, blocksize)
    self.redist_solve_in_2a = Redistributor(self.world, self.eh_descr2a, self.solve_descr2a)
    self.redist_solve_in_2b = Redistributor(self.world, self.eh_descr2b, self.solve_descr2b)
    self.redist_solve_out_2a = Redistributor(self.world, self.solve_descr2a, self.eh_descr2a)
    self.redist_solve_out_2b = Redistributor(self.world, self.solve_descr2b, self.eh_descr2b)
    """

    # -----------------------------------------------------------------
    # for SCALAPACK we need TRANSPOSED MATRIX (and vector)
    # -----------------------------------------------------------------
    dim = nkq * 4
    nrhs = 1

    # Matrix: dd_comm.size x eh_comm.size grid over the world communicator
    self.eh_grid2a = BlacsGrid(self.world, dd_comm.size, eh_comm.size)
    # Vector: a single process row spanning all dd x eh ranks
    self.eh_grid2b = BlacsGrid(self.world, 1,
                               dd_comm.size * eh_comm.size)

    # Stored layouts use blocks of 4 (matrix: 4 x 4, vector: 1 x 4).
    self.eh_descr2a = self.eh_grid2a.new_descriptor(dim, dim, 4, 4)
    self.eh_descr2b = self.eh_grid2b.new_descriptor(nrhs, dim, nrhs, 4)

    # Solver layouts live on the diagonalization grid.
    self.solve_descr2a = self.diag_grid.new_descriptor(
        dim, dim, blocksize, blocksize)
    self.solve_descr2b = self.diag_grid.new_descriptor(
        nrhs, dim, nrhs, blocksize)

    self.redist_solve_in_2a = Redistributor(self.world,
                                            self.eh_descr2a,
                                            self.solve_descr2a)
    self.redist_solve_in_2b = Redistributor(self.world,
                                            self.eh_descr2b,
                                            self.solve_descr2b)

    self.redist_solve_out_2a = Redistributor(self.world,
                                             self.solve_descr2a,
                                             self.eh_descr2a)
    self.redist_solve_out_2b = Redistributor(self.world,
                                             self.solve_descr2b,
                                             self.eh_descr2b)
def calculate_blocked_density_matrix(self, f_n, C_nM):
    """Return the density matrix in the ``mmdescriptor`` layout.

    ``C_nM * f_n[:, None]`` and ``C_nM`` are both redistributed from the
    ``nM_unique_descriptor`` layout to the ``mmdescriptor`` layout and
    multiplied with ``pblas_simple_gemm(..., transa='C')``.

    Parameters:

    f_n: ndarray
        One factor per row of ``C_nM``.
    C_nM: ndarray
        Coefficients; must pass ``self.nMdescriptor.checkassert``.  Only
        the array on ``gd.rank == 0`` is used; other ranks contribute
        zero arrays.
    """
    dtype = C_nM.dtype
    nbands = self.bd.nbands
    nao = self.nao

    self.nMdescriptor.checkassert(C_nM)
    if self.gd.rank != 0:
        # Ranks other than the grid master feed zero-filled arrays of
        # the unique layout into the redistribution.
        C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
        Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
    else:
        Cf_nM = C_nM * f_n[:, None]

    to_blocks = Redistributor(self.block_comm,
                              self.nM_unique_descriptor,
                              self.mmdescriptor)

    Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
    to_blocks.redistribute(Cf_nM, Cf_mm, nbands, nao)
    del Cf_nM

    C_mm = self.mmdescriptor.zeros(dtype=dtype)
    to_blocks.redistribute(C_nM, C_mm, nbands, nao)
    # no use to delete C_nM as it's in the input...

    rho_mm = self.mmdescriptor.zeros(dtype=dtype)

    if 1:  # if self.libelpa is None:
        pblas_simple_gemm(self.mmdescriptor,
                          self.mmdescriptor,
                          self.mmdescriptor,
                          Cf_mm, C_mm, rho_mm, transa='C')
    else:
        # elpa_hermitian_multiply was not faster than the ordinary
        # multiplication in the test.  The way we have things distributed,
        # we need to transpose things at the moment.
        #
        # Rather than enabling this, we should store the coefficients
        # in an appropriate 2D block cyclic format (c_nm) and not the
        # current C_nM format.  This makes it possible to avoid
        # redistributing the coefficients at all.  But we don't have time
        # to implement this at the moment.
        mul = self.libelpa.hermitian_multiply
        desc = self.mmdescriptor

        from gpaw.utilities.scalapack import pblas_tran

        def T(array):
            # Transpose ``array`` in place via a temporary copy.
            tmp = array.copy()
            pblas_tran(alpha=1.0, a_MN=tmp,
                       beta=0.0, c_NM=array,
                       desca=desc, descc=desc)

        T(C_mm)
        T(Cf_mm)
        mul(C_mm, Cf_mm, rho_mm,
            desc, desc, desc,
            uplo_a='X', uplo_c='X')

    return rho_mm
def tddft_init(self):
    """Prepare descriptors, inverse overlaps and mixer once.

    Does nothing if ``self.tddft_initialized`` is already true.
    Otherwise it

    * records whether BLACS is in use and, if so, builds the BLACS
      descriptors and redistributors and calls ``scalapack_zero(...,
      'U')`` on ``S_MM`` and ``T_MM`` of every k-point,
    * for the ``'taylor'`` propagator stores an inverse overlap matrix
      (``kpt.invS_MM`` for every k-point with BLACS, ``invS`` on the
      first k-point only without),
    * replaces the density mixer by a ``DummyMixer``,
    * stores a copy of ``C_nM`` as ``C2_nM`` on every k-point.
    """
    if self.tddft_initialized:
        return

    self.blacs = self.wfs.ksl.using_blacs
    if self.blacs:
        self.ksl = ksl = self.wfs.ksl
        nao = ksl.nao
        nbands = ksl.bd.nbands
        mynbands = ksl.bd.mynbands
        blocksize = ksl.blocksize

        from gpaw.blacs import Redistributor
        if self.wfs.world.rank == 0:
            print('BLACS Parallelization')

        # Parallel grid descriptors
        grid = ksl.blockgrid
        column_grid = ksl.single_column_grid
        assert grid.nprow * grid.npcol == self.wfs.ksl.block_comm.size

        # FOR DEBUG
        self.MM_descriptor = grid.new_descriptor(nao, nao, nao, nao)
        self.mm_block_descriptor = grid.new_descriptor(
            nao, nao, blocksize, blocksize)
        self.Cnm_block_descriptor = grid.new_descriptor(
            nbands, nao, blocksize, blocksize)
        self.mM_column_descriptor = column_grid.new_descriptor(
            nao, nao, ksl.naoblocksize, nao)
        self.CnM_unique_descriptor = column_grid.new_descriptor(
            nbands, nao, mynbands, nao)

        # Redistributors (names read <source layout>2<target layout>)
        comm = ksl.block_comm
        self.mm2MM = Redistributor(comm,
                                   self.mm_block_descriptor,
                                   self.MM_descriptor)  # XXX FOR DEBUG
        self.MM2mm = Redistributor(comm,
                                   self.MM_descriptor,
                                   self.mm_block_descriptor)  # FOR DEBUG
        self.Cnm2nM = Redistributor(comm,
                                    self.Cnm_block_descriptor,
                                    self.CnM_unique_descriptor)
        self.CnM2nm = Redistributor(comm,
                                    self.CnM_unique_descriptor,
                                    self.Cnm_block_descriptor)
        self.mM2mm = Redistributor(comm,
                                   self.mM_column_descriptor,
                                   self.mm_block_descriptor)

        for kpt in self.wfs.kpt_u:
            for matrix in (kpt.S_MM, kpt.T_MM):
                scalapack_zero(self.mm_block_descriptor, matrix, 'U')

    # XXX to propagator class
    if self.propagator == 'taylor':
        if self.blacs:
            for kpt in self.wfs.kpt_u:
                kpt.invS_MM = kpt.S_MM.copy()
                scalapack_inverse(self.mm_block_descriptor,
                                  kpt.invS_MM, 'L')
        else:
            # Only the first k-point gets an inverse here.
            first_kpt = self.wfs.kpt_u[0]
            first_kpt.invS = inv(first_kpt.S_MM)

    # Reset the density mixer
    self.density.set_mixer(DummyMixer())
    self.tddft_initialized = True

    for kpt in self.wfs.kpt_u:
        kpt.C2_nM = kpt.C_nM.copy()
from gpaw.blacs import BlacsGrid, Redistributor

# Demonstration of Redistributor.redistribute with explicit sub-matrix
# arguments.  Needs at least two MPI ranks for the 2 x (size // 2) grid.
if world.size < 2:
    raise ValueError('Runs on two or more processors')

grid = BlacsGrid(world, 2, world.size // 2)

# Source: 12 x 8 matrix in 2 x 3 blocks; every rank fills its local part
# with its own rank number so the origin of each element is visible.
desc = grid.new_descriptor(12, 8, 2, 3)
a = desc.zeros()
a[:] = world.rank

# Target: 7 x 7 matrix in 2 x 2 blocks, initially zero.
subdesc = grid.new_descriptor(7, 7, 2, 2)
b = subdesc.zeros()

r = Redistributor(grid.comm, desc, subdesc, uplo='G')

# NOTE(review): presumably an M x N sub-matrix of ``a`` starting at
# (ia, ja) is copied into ``b`` starting at (ib, jb) -- confirm the
# index convention against Redistributor.redistribute.
ia = 3
ja = 2
ib = 1
jb = 1
M = 4
N = 5

r.redistribute(a, b, M, N, ia, ja, ib, jb)

a0 = desc.collect_on_master(a)
b0 = subdesc.collect_on_master(b)
if world.rank == 0:
    # Function-call form: valid on Python 3 and prints the same on
    # Python 2 for a single argument.
    print(a0)
    print(b0)