def initialize(self, wfs):
    """Attach this eigensolver to ``wfs`` and allocate its work storage.

    Copies the timer, communicators, dtype, band descriptor, diagonalizer
    layout and matrix operator from ``wfs`` onto ``self``, decides whether
    the ``Htpsit_nG`` array can be kept, builds the preconditioner and
    makes sure every k-point has an ``eps_n`` array.

    Parameters
    ----------
    wfs:
        Wave-function object providing ``timer``, ``world``, ``kd``,
        ``band_comm``, ``dtype``, ``bd``, ``diagksl``, ``matrixoperator``,
        ``kpt_u``, ``empty`` and ``make_preconditioner``.
    """
    self.timer = wfs.timer
    self.world = wfs.world
    self.kpt_comm = wfs.kd.comm
    self.band_comm = wfs.band_comm
    self.dtype = wfs.dtype

    band_descriptor = wfs.bd
    self.bd = band_descriptor
    self.ksl = wfs.diagksl
    self.nbands = band_descriptor.nbands
    self.mynbands = band_descriptor.mynbands
    self.operator = wfs.matrixoperator

    # Htpsit_nG is only kept when every band is local and the matrix
    # operator works on a single block.
    bands_are_distributed = self.mynbands != self.nbands
    if bands_are_distributed or self.operator.nblocks != 1:
        self.keep_htpsit = False

    if self.keep_htpsit:
        self.Htpsit_nG = wfs.empty(self.nbands)
        if use_mic:
            # Register the host buffer with the offload stream.
            self.Htpsit_nG_mic = stream.bind(self.Htpsit_nG)
            stream.sync()

    # Preconditioner for the electronic gradients:
    self.preconditioner = wfs.make_preconditioner(self.blocksize)

    # Give every k-point that lacks one an (uninitialized) eigenvalue array.
    for kpt in (k for k in wfs.kpt_u if k.eps_n is None):
        kpt.eps_n = np.empty(self.mynbands)

    # Allocate arrays for matrix operator
    self.operator.allocate_arrays()

    self.initialized = True
def rk(alpha, a, beta, c, trans='c'):
    """Rank-k update of a matrix, executed through the offload stream.

    Invokes the ``mic_syrk`` kernel of ``library`` on the offload arrays
    ``a`` and ``c`` and waits for the stream to finish.

    Parameters
    ----------
    alpha, beta:
        Scalars handed to the kernel.  They are converted to ``complex``
        when ``a`` holds complex data.
    a, c: mic.OffloadArray
        Input matrix and the matrix that is updated.
    trans: str
        Accepted for interface compatibility; the value is not used.
    """
    assert isinstance(a, mic.OffloadArray)
    assert isinstance(c, mic.OffloadArray)

    # Integer code that tells the kernel which data type it operates on.
    dt = map_dtype(a.dtype)

    # ``a`` is treated as an (n, k) matrix: the leading axis by the
    # product of all remaining axes.
    n = a.shape[0]
    k = np.prod(a.shape[1:])

    # Leading dimension in elements.  Floor division keeps this an
    # integer on Python 3 as well (``/`` would produce a float there).
    ldc = c.array.strides[0] // c.array.strides[1]

    # ``np.complex`` was only an alias of the builtin ``complex`` and is
    # gone from recent NumPy; compare against the builtin directly.
    if a.dtype == complex:
        alpha = complex(alpha)
        beta = complex(beta)

    # perform the offload
    stream.invoke(library.mic_syrk, dt, a, c, n, k, ldc, alpha, beta)
    stream.sync()
def initialize_from_lcao_coefficients(self, basis_functions, mynbands):
    """Build grid wave functions from the LCAO coefficients of each k-point.

    For every k-point a zeroed ``psit_nG`` array with ``self.bd.mynbands``
    bands is created, its first ``mynbands`` bands are filled through
    ``basis_functions.lcao_to_grid`` from ``kpt.C_nM``, and ``kpt.C_nM``
    is then reset to ``None``.  With MIC offload enabled the new array is
    also bound to the offload stream as ``kpt.psit_nG_mic``.
    """
    for kpt in self.kpt_u:
        grid_psit_nG = self.gd.zeros(self.bd.mynbands, self.dtype)
        kpt.psit_nG = grid_psit_nG
        basis_functions.lcao_to_grid(kpt.C_nM, grid_psit_nG[:mynbands],
                                     kpt.q)
        # The coefficients are not needed once they are on the grid.
        kpt.C_nM = None

        if not use_mic:
            continue
        kpt.psit_nG_mic = stream.bind(kpt.psit_nG)
        stream.sync()
def empty(self, n=(), dtype=float, global_array=False, pad=False,
          usemic=False):
    """Return a new uninitialized 3D array for this domain.

    Parameters
    ----------
    n:
        Extra leading dimension(s), e.g. ``n=dim``.
    dtype:
        Element type of the array (default ``float``).
    global_array: bool
        Allocate an array spanning all domains.
    pad: bool
        Forwarded unchanged to ``self._new_array``.
    usemic: bool
        When true, bind the array to the offload stream and return the
        bound object instead of the plain array.
    """
    host_array = self._new_array(n, dtype, False, global_array, pad)
    if not usemic:
        return host_array

    offload_array = stream.bind(host_array)
    stream.sync()
    return offload_array
def gemm(alpha, a, b, beta, c, transa='n'):
    """General matrix multiply executed through the offload stream.

    Invokes the ``mic_gemm`` kernel of ``library`` on the offload arrays
    ``a``, ``b`` and ``c`` and waits for the stream to finish.  Each array
    is treated as a 2D matrix: its leading axis by the product of all
    remaining axes.

    Parameters
    ----------
    alpha, beta:
        Scalars handed to the kernel.  They are converted to ``complex``
        when ``a`` holds complex data.
    a, b, c: mic.OffloadArray
        Operands and result matrix.
    transa: str
        ``'n'`` selects the non-transposed variant (kernel flag 0); any
        other value selects the transposed variant (kernel flag 1).
    """
    # we want to make sure that we only use OffloadArrays here
    assert isinstance(a, mic.OffloadArray)
    assert isinstance(b, mic.OffloadArray)
    assert isinstance(c, mic.OffloadArray)

    # determine the datatype and map it to int
    dt = map_dtype(a.dtype)

    # determine sizes of the matrices
    am = a.shape[0]
    ak = np.prod(a.shape[1:])
    bk = b.shape[0]
    bn = np.prod(b.shape[1:])
    cm = c.shape[0]
    cn = np.prod(c.shape[1:])

    # Leading dimensions are element counts, so they must be integers:
    # floor division gives the same value as Python 2's ``/`` did and
    # avoids handing floats to the kernel on Python 3.
    if transa == 'n':
        # just some safety checks
        assert am == bn
        assert ak == cn
        assert bk == cm
        trans = 0
        m, n, k = ak, bk, am
        lda = a.array.strides[0] // a.array.strides[-1]
        ldb = b.array.strides[0] // b.array.strides[1]
        ldc = c.array.strides[0] // c.array.strides[-1]
    else:
        assert am == cn
        assert ak == bn
        assert bk == cm
        trans = 1
        m, n, k = am, bk, ak
        lda = k
        ldb = b.array.strides[0] // b.array.strides[-1]
        ldc = c.array.strides[0] // c.array.strides[1]

    # ``np.complex`` was only an alias of the builtin ``complex`` and is
    # gone from recent NumPy; compare against the builtin directly.
    if a.dtype == complex:
        alpha = complex(alpha)
        beta = complex(beta)

    # perform the offload
    stream.invoke(library.mic_gemm, dt, a, b, c, m, n, k,
                  lda, ldb, ldc, alpha, beta, trans)
    stream.sync()
def allocate_arrays(self):
    """Allocate the matrix and work arrays used by this operator.

    Always creates ``A_nn`` (plus ``A_qnn`` when there is more than one
    band group).  With a single band group and a single block only
    ``work1_xG`` is created, sized by the local number of bands;
    otherwise ``work1_xG`` and ``work2_xG`` are created with ``self.X``
    entries.  With MIC offload enabled, ``A_nn`` and the single-block
    ``work1_xG`` are additionally bound to the offload stream.
    """
    band_descriptor = self.bd
    ngroups = band_descriptor.comm.size
    mynbands = band_descriptor.mynbands
    dtype = self.dtype

    if ngroups > 1:
        self.A_qnn = np.zeros((self.Q, mynbands, mynbands), dtype)

    self.A_nn = self.bmd.zeros(dtype=dtype)
    if use_mic:
        self.A_nn_mic = stream.bind(self.A_nn)
        stream.sync()

    single_group_single_block = ngroups == 1 and self.nblocks == 1
    if not single_group_single_block:
        # Blocked and/or band-parallel case: two buffers of X entries.
        self.work1_xG = self.gd.empty(self.X, dtype)
        self.work2_xG = self.gd.empty(self.X, dtype)
        return

    self.work1_xG = self.gd.empty(mynbands, dtype)
    if use_mic:
        self.work1_xG_mic = stream.bind(self.work1_xG)
        stream.sync()
def H(psit_xG):
    """Apply the pseudo Hamiltonian to ``psit_xG`` and return the result.

    Closure helper: ``self``, ``wfs``, ``kpt``, ``hamiltonian`` and
    ``Htpsit_nG`` are expected to come from the enclosing scope.
    """
    # Choose the output buffer: the kept Htpsit array when available,
    # otherwise the operator's first work array.
    if self.keep_htpsit:
        result_xG = Htpsit_nG
    elif use_mic:
        result_xG = self.operator.work1_xG_mic
    else:
        result_xG = reshape(self.operator.work1_xG, psit_xG.shape)

    if not use_mic:
        wfs.apply_pseudo_hamiltonian(kpt, hamiltonian, psit_xG, result_xG)
    else:
        # The Hamiltonian is applied to the wrapped ``.array`` buffers;
        # the result is pushed to the device afterwards.
        psit_xG.update_device()
        wfs.apply_pseudo_hamiltonian(kpt, hamiltonian,
                                     psit_xG.array, result_xG.array)
        result_xG.update_device()
        stream.sync()

    hamiltonian.xc.apply_orbital_dependent_hamiltonian(
        kpt, psit_xG, result_xG, hamiltonian.dH_asp)
    return result_xG
def r2k(alpha, a, b, beta, c):
    """Rank-2k update of a matrix, executed through the offload stream.

    Invokes the ``mic_dsyr2k`` kernel of ``library`` on the offload
    arrays ``a``, ``b`` and ``c`` and waits for the stream to finish.

    Parameters
    ----------
    alpha, beta:
        Scalars handed to the kernel unchanged.
    a, b, c: mic.OffloadArray
        Operands and the matrix that is updated.  Inputs whose
        ``map_dtype`` code equals 2 are rejected by an assertion
        (presumably the complex type -- confirm against ``map_dtype``).
    """
    assert isinstance(a, mic.OffloadArray)
    assert isinstance(b, mic.OffloadArray)
    assert isinstance(c, mic.OffloadArray)
    assert map_dtype(a.dtype) != 2

    # ``a`` is treated as an (n, k) matrix: the leading axis by the
    # product of all remaining axes.
    n = a.shape[0]
    k = np.prod(a.shape[1:])

    # Leading dimension in elements.  Floor division keeps this an
    # integer on Python 3 as well (``/`` would produce a float there).
    ldc = c.array.strides[0] // c.array.strides[1]

    stream.invoke(library.mic_dsyr2k, a, b, c, n, k, ldc, alpha, beta)
    stream.sync()
def r2k(alpha, a, b, beta, c):
    """Rank-2k update of a matrix, executed through the offload stream.

    Invokes the ``mic_dsyr2k`` kernel of ``library`` on the offload
    arrays ``a``, ``b`` and ``c`` and waits for the stream to finish.

    Parameters
    ----------
    alpha, beta:
        Scalars handed to the kernel unchanged.
    a, b, c: mic.OffloadArray
        Operands and the matrix that is updated.  Inputs whose
        ``map_dtype`` code equals 2 are rejected by an assertion
        (presumably the complex type -- confirm against ``map_dtype``).
    """
    assert isinstance(a, mic.OffloadArray)
    assert isinstance(b, mic.OffloadArray)
    assert isinstance(c, mic.OffloadArray)
    assert map_dtype(a.dtype) != 2

    # ``a`` is treated as an (n, k) matrix: the leading axis by the
    # product of all remaining axes.
    n = a.shape[0]
    k = np.prod(a.shape[1:])

    # Leading dimension in elements.  Floor division keeps this an
    # integer on Python 3 as well (``/`` would produce a float there).
    ldc = c.array.strides[0] // c.array.strides[1]

    stream.invoke(library.mic_dsyr2k, a, b, c, n, k, ldc, alpha, beta)
    stream.sync()
def subspace_diagonalize(self, hamiltonian, wfs, kpt):
    """Diagonalize the Hamiltonian in the subspace of kpt.psit_nG.

    *Htpsit_nG* is a work array of the same size as psit_nG which
    contains the local part of the Hamiltonian times psit on exit.

    First the Hamiltonian (defined by *kin*, *vt_sG* and *dH_asp*) is
    applied to the wave functions, then the *H_nn* matrix is calculated
    and diagonalized, and finally the wave functions (and also
    Htpsit_nG) are rotated.  The projections *P_ani* are rotated too.

    It is assumed that the wave functions *psit_nG* are orthonormal and
    that the integrals of projector functions and wave functions
    *P_ani* are already calculated.

    Returns the rotated wave functions and, if ``self.keep_htpsit`` is
    true, H applied to the rotated wave functions (otherwise ``None``).
    With MIC offload enabled the host ``.array`` buffers are returned.
    """
    if self.band_comm.size > 1 and wfs.bd.strided:
        raise NotImplementedError

    self.timer.start('Subspace diag')

    psit_nG = kpt.psit_nG_mic if use_mic else kpt.psit_nG
    P_ani = kpt.P_ani

    if not self.keep_htpsit:
        Htpsit_nG = None
    elif use_mic:
        Htpsit_nG = self.Htpsit_nG_mic
    else:
        Htpsit_nG = reshape(self.Htpsit_nG, psit_nG.shape)

    def H(psit_xG):
        # Output goes to the kept Htpsit array when available, otherwise
        # to the operator's first work array.
        if self.keep_htpsit:
            result_xG = Htpsit_nG
        elif use_mic:
            result_xG = self.operator.work1_xG_mic
        else:
            result_xG = reshape(self.operator.work1_xG, psit_xG.shape)

        if not use_mic:
            wfs.apply_pseudo_hamiltonian(kpt, hamiltonian,
                                         psit_xG, result_xG)
        else:
            psit_xG.update_device()
            wfs.apply_pseudo_hamiltonian(kpt, hamiltonian,
                                         psit_xG.array, result_xG.array)
            result_xG.update_device()
            stream.sync()

        hamiltonian.xc.apply_orbital_dependent_hamiltonian(
            kpt, psit_xG, result_xG, hamiltonian.dH_asp)
        return result_xG

    def dH(a, P_ni):
        dH_ii = unpack(hamiltonian.dH_asp[a][kpt.s])
        return np.dot(P_ni, dH_ii)

    self.timer.start('calc_h_matrix')
    H_nn = self.operator.calculate_matrix_elements(psit_nG, P_ani, H, dH)
    hamiltonian.xc.correct_hamiltonian_matrix(kpt, H_nn)
    self.timer.stop('calc_h_matrix')

    diagonalization_string = repr(self.ksl)
    wfs.timer.start(diagonalization_string)
    # After this call H_nn holds the result of the diagonalization.
    self.ksl.diagonalize(H_nn, kpt.eps_n)
    wfs.timer.stop(diagonalization_string)

    self.timer.start('rotate_psi')
    psit_nG = self.operator.matrix_multiply(H_nn, psit_nG, P_ani)
    if self.keep_htpsit:
        # The rotated Htpsit is written into the k-point's psit storage.
        target_nG = kpt.psit_nG_mic if use_mic else kpt.psit_nG
        Htpsit_nG = self.operator.matrix_multiply(H_nn, Htpsit_nG,
                                                  out_nG=target_nG)

    # Rotate orbital dependent XC stuff:
    hamiltonian.xc.rotate(kpt, H_nn)

    self.timer.stop('rotate_psi')
    self.timer.stop('Subspace diag')

    if not use_mic:
        return psit_nG, Htpsit_nG

    # Bring the offloaded results back before handing out host arrays.
    psit_nG.update_host()
    stream.sync()
    if not self.keep_htpsit:
        return psit_nG.array, Htpsit_nG
    Htpsit_nG.update_host()
    stream.sync()
    return psit_nG.array, Htpsit_nG.array
def orthonormalize(self, wfs, kpt, psit_nG=None):
    """Orthonormalize the wave functions with respect to the overlap.

    The overlap matrix ``S_nn = <psit_n | S | psit_n'>`` is built from
    the wave functions and the projections ``P_ani``.  By default its
    Cholesky factorization is inverted (``self.ksl.inverse_cholesky``)
    and the resulting matrix ``C_nn`` is used to form the new linear
    combinations::

        psit_n'(r) = sum_m C_nm psit_m(r)

    When ``extra_parameters['sic']`` is set, a symmetric Loewdin
    orthonormalization is used instead.

    Parameters
    ----------
    wfs:
        Wave-function object providing ``pt``, ``matrixoperator`` and
        ``setups``.
    kpt: KPoint object
        k-point whose ``psit_nG`` (and ``psit_nG_mic`` with MIC offload)
        receives the orthonormalized vectors; ``P_ani`` is updated too.
    psit_nG: ndarray, optional
        Vectors to orthonormalize.  Defaults to ``kpt.psit_nG``.
    """
    self.timer.start('Orthonormalize')

    if psit_nG is None:
        psit_nG = kpt.psit_nG
        if use_mic:
            psit_nG_mic = kpt.psit_nG_mic
    elif use_mic:
        # Caller-supplied vectors: bind them without copying yet; the
        # device copy is refreshed right before the overlap is built.
        psit_nG_mic = stream.bind(psit_nG, update_device=False)
        stream.sync()

    P_ani = kpt.P_ani
    self.timer.start('projections')
    wfs.pt.integrate(psit_nG, P_ani, kpt.q)
    self.timer.stop('projections')

    # Construct the overlap matrix:
    operator = wfs.matrixoperator

    def S(psit_G):
        return psit_G

    def dS(a, P_ni):
        return np.dot(P_ni, wfs.setups[a].dO_ii)

    self.timer.start('calc_s_matrix')
    if use_mic:
        psit_nG_mic.update_device()
        stream.sync()
        S_nn = operator.calculate_matrix_elements(psit_nG_mic, P_ani,
                                                  S, dS)
    else:
        S_nn = operator.calculate_matrix_elements(psit_nG, P_ani, S, dS)
    self.timer.stop('calc_s_matrix')

    orthonormalization_string = repr(self.ksl)
    self.timer.start(orthonormalization_string)
    if extra_parameters.get('sic', False):
        # symmetric Loewdin Orthonormalization
        tri2full(S_nn, UL='L', map=np.conj)
        eigenvalue_n = np.empty(S_nn.shape[0])
        diagonalize(S_nn, eigenvalue_n)
        inv_sqrt_nn = np.diag(1.0 / np.sqrt(eigenvalue_n))
        C_nn = np.dot(np.dot(S_nn.T.conj(), inv_sqrt_nn), S_nn)
    else:
        # S_nn is overwritten with the inverse of its Cholesky factor;
        # from here on it is referred to as C_nn.
        self.ksl.inverse_cholesky(S_nn)
        C_nn = S_nn
    del S_nn
    self.timer.stop(orthonormalization_string)

    self.timer.start('rotate_psi')
    if use_mic:
        operator.matrix_multiply(C_nn, psit_nG_mic, P_ani,
                                 out_nG=kpt.psit_nG_mic)
        kpt.psit_nG_mic.update_host()
        stream.sync()
    else:
        operator.matrix_multiply(C_nn, psit_nG, P_ani, out_nG=kpt.psit_nG)
    self.timer.stop('rotate_psi')

    self.timer.stop('Orthonormalize')
def matrix_multiply(self, C_NN, psit_nG, P_ani=None, out_nG=None):
    """Calculate new linear combinations of wave functions.

    Results will be put in the *P_ani* dict and a new psit_nG returned::

        psit_n      <--  sum_n' C_nn' psit_n'
        <p_i|psit_n> <--  sum_n' C_nn' <p_i|psit_n'>

    Parameters
    ----------
    C_NN: ndarray
        Matrix representation of the requested linear combinations.
        Even with a hermitian operator, this matrix need not be
        self-adjoint.  However, unlike the results from
        calculate_matrix_elements, it is assumed that all matrix
        elements are filled in (use e.g. tri2full).
    psit_nG: ndarray
        Set of vectors in which the matrix elements are evaluated.
    P_ani: dict
        Dictionary of projector overlap integrals
        P_ni = <p_i | psit_nG>.
    out_nG: ndarray, optional
        Output array; only used in the single-group, single-block case.
    """
    if self.A_nn is None:
        self.allocate_arrays()

    band_comm = self.bd.comm
    B = band_comm.size
    J = self.nblocks
    N = self.bd.mynbands

    C_NN = self.bmd.redistribute_input(C_NN)

    if B == 1 and J == 1:
        # Simple case: one band group, one block.
        if use_mic:
            work_nG = self.work1_xG_mic
        else:
            work_nG = reshape(self.work1_xG, psit_nG.shape)

        if out_nG is None:
            out_nG = work_nG
        elif out_nG is psit_nG:
            # In-place request: move the input into the work array so
            # that input and output do not alias.
            work_nG[:] = psit_nG
            psit_nG = work_nG

        if not use_mic:
            self.gd.gemm(1.0, psit_nG, C_NN, 0.0, out_nG)
        else:
            if self.gd.comm.rank == 0:
                offload_report(1)
            # Stage the coefficient matrix in the bound A_nn buffer.
            C_NN_mic = self.A_nn_mic
            C_NN_mic.array[:] = C_NN[:]
            C_NN_mic.update_device()
            stream.sync()
            mic_gemm(1.0, psit_nG, C_NN_mic, 0.0, out_nG)
            if self.gd.comm.rank == 0:
                offload_report(0)

        if P_ani:
            for P_ni in P_ani.values():
                gemm(1.0, P_ni.copy(), C_NN, 0.0, P_ni)
        return out_nG

    # Now it gets nasty! We parallelize over B groups of bands and
    # each grid chunk is divided in J smaller slices (less memory).

    Q = B  # always non-hermitian XXX
    rank = band_comm.rank
    original_shape = psit_nG.shape
    psit_nG = psit_nG.reshape(N, -1)
    G = psit_nG.shape[1]  # number of grid-points
    g = int(np.ceil(G / float(J)))

    # Buffers for send/receive of pre-multiplication versions of P_ani's.
    sbuf_nI = rbuf_nI = None
    if P_ani:
        sbuf_nI = np.ascontiguousarray(np.hstack(list(P_ani.values())))
        if B > 1:
            rbuf_nI = np.empty_like(sbuf_nI)

    # Because of the amount of communication involved, we need to
    # be synchronized up to this point but only on the 1D band_comm
    # communication ring
    band_comm.barrier()

    while g * J >= G + g:  # remove extra slice(s)
        J -= 1
    assert 0 < g * J < G + g

    slab_shape = (self.X,) + psit_nG.shape[1:]
    work1_xG = reshape(self.work1_xG, slab_shape)
    work2_xG = reshape(self.work2_xG, slab_shape)

    last_q = Q - 1
    for j in range(J):
        G1 = j * g
        G2 = min(G1 + g, G)
        g = G2 - G1  # only changes for a shortened final slice
        sbuf_ng = reshape(work1_xG, (N, g))
        rbuf_ng = reshape(work2_xG, (N, g))
        sbuf_ng[:] = psit_nG[:, G1:G2]
        cycle_P_ani = (j == J - 1 and P_ani)

        for q in range(Q):
            # The first pass initializes the kets, later ones accumulate.
            beta = 0.0 if q == 0 else 1.0
            more_to_come = q < last_q

            # Start sending currently buffered kets to rank below
            # and receiving next set of kets from rank above us.
            # If we're at the last slice, start cycling P_ani too.
            if more_to_come:
                self._initialize_cycle(sbuf_ng, rbuf_ng,
                                       sbuf_nI, rbuf_nI, cycle_P_ani)

            # Calculate wave-function contributions from the current slice
            # of grid data by the current mynbands x mynbands matrix block.
            C_nn = self.bmd.extract_block(C_NN, (rank + q) % B, rank)
            self.gd.gemm(1.0, sbuf_ng, C_nn, beta, psit_nG[:, G1:G2])

            # If we're at the last slice, add contributions to P_ani's.
            if cycle_P_ani:
                I1 = 0
                for P_ni in P_ani.values():
                    I2 = I1 + P_ni.shape[1]
                    gemm(1.0, sbuf_nI[:, I1:I2], C_nn, beta, P_ni)
                    I1 = I2

            # Wait for all send/receives to finish before next iteration.
            # Swap send and receive buffer such that next becomes current.
            # If we're at the last slice, also finishes the P_ani cycle.
            if more_to_come:
                sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI = self._finish_cycle(
                    sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI, cycle_P_ani)

    psit_nG.shape = original_shape
    return psit_nG
def subspace_diagonalize(self, hamiltonian, wfs, kpt):
    """Diagonalize the Hamiltonian in the subspace of kpt.psit_nG.

    *Htpsit_nG* is a work array of the same size as psit_nG which
    contains the local part of the Hamiltonian times psit on exit.

    First the Hamiltonian (defined by *kin*, *vt_sG* and *dH_asp*) is
    applied to the wave functions, then the *H_nn* matrix is calculated
    and diagonalized, and finally the wave functions (and also
    Htpsit_nG) are rotated.  The projections *P_ani* are rotated too.

    It is assumed that the wave functions *psit_nG* are orthonormal and
    that the integrals of projector functions and wave functions
    *P_ani* are already calculated.

    Returns the rotated wave functions and, if ``self.keep_htpsit`` is
    true, H applied to the rotated wave functions (otherwise ``None``).
    With MIC offload enabled the host ``.array`` buffers are returned.
    """
    if self.band_comm.size > 1 and wfs.bd.strided:
        raise NotImplementedError

    self.timer.start('Subspace diag')

    psit_nG = kpt.psit_nG_mic if use_mic else kpt.psit_nG
    P_ani = kpt.P_ani

    if not self.keep_htpsit:
        Htpsit_nG = None
    elif use_mic:
        Htpsit_nG = self.Htpsit_nG_mic
    else:
        Htpsit_nG = reshape(self.Htpsit_nG, psit_nG.shape)

    def H(psit_xG):
        # Output goes to the kept Htpsit array when available, otherwise
        # to the operator's first work array.
        if self.keep_htpsit:
            result_xG = Htpsit_nG
        elif use_mic:
            result_xG = self.operator.work1_xG_mic
        else:
            result_xG = reshape(self.operator.work1_xG, psit_xG.shape)

        if not use_mic:
            wfs.apply_pseudo_hamiltonian(kpt, hamiltonian,
                                         psit_xG, result_xG)
        else:
            psit_xG.update_device()
            wfs.apply_pseudo_hamiltonian(kpt, hamiltonian,
                                         psit_xG.array, result_xG.array)
            result_xG.update_device()
            stream.sync()

        hamiltonian.xc.apply_orbital_dependent_hamiltonian(
            kpt, psit_xG, result_xG, hamiltonian.dH_asp)
        return result_xG

    def dH(a, P_ni):
        dH_ii = unpack(hamiltonian.dH_asp[a][kpt.s])
        return np.dot(P_ni, dH_ii)

    self.timer.start('calc_h_matrix')
    H_nn = self.operator.calculate_matrix_elements(psit_nG, P_ani, H, dH)
    hamiltonian.xc.correct_hamiltonian_matrix(kpt, H_nn)
    self.timer.stop('calc_h_matrix')

    diagonalization_string = repr(self.ksl)
    wfs.timer.start(diagonalization_string)
    # After this call H_nn holds the result of the diagonalization.
    self.ksl.diagonalize(H_nn, kpt.eps_n)
    wfs.timer.stop(diagonalization_string)

    self.timer.start('rotate_psi')
    psit_nG = self.operator.matrix_multiply(H_nn, psit_nG, P_ani)
    if self.keep_htpsit:
        # The rotated Htpsit is written into the k-point's psit storage.
        target_nG = kpt.psit_nG_mic if use_mic else kpt.psit_nG
        Htpsit_nG = self.operator.matrix_multiply(H_nn, Htpsit_nG,
                                                  out_nG=target_nG)

    # Rotate orbital dependent XC stuff:
    hamiltonian.xc.rotate(kpt, H_nn)

    self.timer.stop('rotate_psi')
    self.timer.stop('Subspace diag')

    if not use_mic:
        return psit_nG, Htpsit_nG

    # Bring the offloaded results back before handing out host arrays.
    psit_nG.update_host()
    stream.sync()
    if not self.keep_htpsit:
        return psit_nG.array, Htpsit_nG
    Htpsit_nG.update_host()
    stream.sync()
    return psit_nG.array, Htpsit_nG.array
def matrix_multiply(self, C_NN, psit_nG, P_ani=None, out_nG=None):
    """Calculate new linear combinations of wave functions.

    Results will be put in the *P_ani* dict and a new psit_nG returned::

        psit_n      <--  sum_n' C_nn' psit_n'
        <p_i|psit_n> <--  sum_n' C_nn' <p_i|psit_n'>

    Parameters
    ----------
    C_NN: ndarray
        Matrix representation of the requested linear combinations.
        Even with a hermitian operator, this matrix need not be
        self-adjoint.  However, unlike the results from
        calculate_matrix_elements, it is assumed that all matrix
        elements are filled in (use e.g. tri2full).
    psit_nG: ndarray
        Set of vectors in which the matrix elements are evaluated.
    P_ani: dict
        Dictionary of projector overlap integrals
        P_ni = <p_i | psit_nG>.
    out_nG: ndarray, optional
        Output array; only used in the single-group, single-block case.
    """
    if self.A_nn is None:
        self.allocate_arrays()

    band_comm = self.bd.comm
    B = band_comm.size
    J = self.nblocks
    N = self.bd.mynbands

    C_NN = self.bmd.redistribute_input(C_NN)

    if B == 1 and J == 1:
        # Simple case: one band group, one block.
        if use_mic:
            work_nG = self.work1_xG_mic
        else:
            work_nG = reshape(self.work1_xG, psit_nG.shape)

        if out_nG is None:
            out_nG = work_nG
        elif out_nG is psit_nG:
            # In-place request: move the input into the work array so
            # that input and output do not alias.
            work_nG[:] = psit_nG
            psit_nG = work_nG

        if not use_mic:
            self.gd.gemm(1.0, psit_nG, C_NN, 0.0, out_nG)
        else:
            if self.gd.comm.rank == 0:
                offload_report(1)
            # Stage the coefficient matrix in the bound A_nn buffer.
            C_NN_mic = self.A_nn_mic
            C_NN_mic.array[:] = C_NN[:]
            C_NN_mic.update_device()
            stream.sync()
            mic_gemm(1.0, psit_nG, C_NN_mic, 0.0, out_nG)
            if self.gd.comm.rank == 0:
                offload_report(0)

        if P_ani:
            for P_ni in P_ani.values():
                gemm(1.0, P_ni.copy(), C_NN, 0.0, P_ni)
        return out_nG

    # Now it gets nasty! We parallelize over B groups of bands and
    # each grid chunk is divided in J smaller slices (less memory).

    Q = B  # always non-hermitian XXX
    rank = band_comm.rank
    original_shape = psit_nG.shape
    psit_nG = psit_nG.reshape(N, -1)
    G = psit_nG.shape[1]  # number of grid-points
    g = int(np.ceil(G / float(J)))

    # Buffers for send/receive of pre-multiplication versions of P_ani's.
    sbuf_nI = rbuf_nI = None
    if P_ani:
        sbuf_nI = np.ascontiguousarray(np.hstack(list(P_ani.values())))
        if B > 1:
            rbuf_nI = np.empty_like(sbuf_nI)

    # Because of the amount of communication involved, we need to
    # be synchronized up to this point but only on the 1D band_comm
    # communication ring
    band_comm.barrier()

    while g * J >= G + g:  # remove extra slice(s)
        J -= 1
    assert 0 < g * J < G + g

    slab_shape = (self.X,) + psit_nG.shape[1:]
    work1_xG = reshape(self.work1_xG, slab_shape)
    work2_xG = reshape(self.work2_xG, slab_shape)

    last_q = Q - 1
    for j in range(J):
        G1 = j * g
        G2 = min(G1 + g, G)
        g = G2 - G1  # only changes for a shortened final slice
        sbuf_ng = reshape(work1_xG, (N, g))
        rbuf_ng = reshape(work2_xG, (N, g))
        sbuf_ng[:] = psit_nG[:, G1:G2]
        cycle_P_ani = (j == J - 1 and P_ani)

        for q in range(Q):
            # The first pass initializes the kets, later ones accumulate.
            beta = 0.0 if q == 0 else 1.0
            more_to_come = q < last_q

            # Start sending currently buffered kets to rank below
            # and receiving next set of kets from rank above us.
            # If we're at the last slice, start cycling P_ani too.
            if more_to_come:
                self._initialize_cycle(sbuf_ng, rbuf_ng,
                                       sbuf_nI, rbuf_nI, cycle_P_ani)

            # Calculate wave-function contributions from the current slice
            # of grid data by the current mynbands x mynbands matrix block.
            C_nn = self.bmd.extract_block(C_NN, (rank + q) % B, rank)
            self.gd.gemm(1.0, sbuf_ng, C_nn, beta, psit_nG[:, G1:G2])

            # If we're at the last slice, add contributions to P_ani's.
            if cycle_P_ani:
                I1 = 0
                for P_ni in P_ani.values():
                    I2 = I1 + P_ni.shape[1]
                    gemm(1.0, sbuf_nI[:, I1:I2], C_nn, beta, P_ni)
                    I1 = I2

            # Wait for all send/receives to finish before next iteration.
            # Swap send and receive buffer such that next becomes current.
            # If we're at the last slice, also finishes the P_ani cycle.
            if more_to_come:
                sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI = self._finish_cycle(
                    sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI, cycle_P_ani)

    psit_nG.shape = original_shape
    return psit_nG
def orthonormalize(self, wfs, kpt, psit_nG=None):
    """Orthonormalize the wave functions with respect to the overlap.

    The overlap matrix ``S_nn = <psit_n | S | psit_n'>`` is built from
    the wave functions and the projections ``P_ani``.  By default its
    Cholesky factorization is inverted (``self.ksl.inverse_cholesky``)
    and the resulting matrix ``C_nn`` is used to form the new linear
    combinations::

        psit_n'(r) = sum_m C_nm psit_m(r)

    When ``extra_parameters['sic']`` is set, a symmetric Loewdin
    orthonormalization is used instead.

    Parameters
    ----------
    wfs:
        Wave-function object providing ``pt``, ``matrixoperator`` and
        ``setups``.
    kpt: KPoint object
        k-point whose ``psit_nG`` (and ``psit_nG_mic`` with MIC offload)
        receives the orthonormalized vectors; ``P_ani`` is updated too.
    psit_nG: ndarray, optional
        Vectors to orthonormalize.  Defaults to ``kpt.psit_nG``.
    """
    self.timer.start('Orthonormalize')

    if psit_nG is None:
        psit_nG = kpt.psit_nG
        if use_mic:
            psit_nG_mic = kpt.psit_nG_mic
    elif use_mic:
        # Caller-supplied vectors: bind them without copying yet; the
        # device copy is refreshed right before the overlap is built.
        psit_nG_mic = stream.bind(psit_nG, update_device=False)
        stream.sync()

    P_ani = kpt.P_ani
    self.timer.start('projections')
    wfs.pt.integrate(psit_nG, P_ani, kpt.q)
    self.timer.stop('projections')

    # Construct the overlap matrix:
    operator = wfs.matrixoperator

    def S(psit_G):
        return psit_G

    def dS(a, P_ni):
        return np.dot(P_ni, wfs.setups[a].dO_ii)

    self.timer.start('calc_s_matrix')
    if use_mic:
        psit_nG_mic.update_device()
        stream.sync()
        S_nn = operator.calculate_matrix_elements(psit_nG_mic, P_ani,
                                                  S, dS)
    else:
        S_nn = operator.calculate_matrix_elements(psit_nG, P_ani, S, dS)
    self.timer.stop('calc_s_matrix')

    orthonormalization_string = repr(self.ksl)
    self.timer.start(orthonormalization_string)
    if extra_parameters.get('sic', False):
        # symmetric Loewdin Orthonormalization
        tri2full(S_nn, UL='L', map=np.conj)
        eigenvalue_n = np.empty(S_nn.shape[0])
        diagonalize(S_nn, eigenvalue_n)
        inv_sqrt_nn = np.diag(1.0 / np.sqrt(eigenvalue_n))
        C_nn = np.dot(np.dot(S_nn.T.conj(), inv_sqrt_nn), S_nn)
    else:
        # S_nn is overwritten with the inverse of its Cholesky factor;
        # from here on it is referred to as C_nn.
        self.ksl.inverse_cholesky(S_nn)
        C_nn = S_nn
    del S_nn
    self.timer.stop(orthonormalization_string)

    self.timer.start('rotate_psi')
    if use_mic:
        operator.matrix_multiply(C_nn, psit_nG_mic, P_ani,
                                 out_nG=kpt.psit_nG_mic)
        kpt.psit_nG_mic.update_host()
        stream.sync()
    else:
        operator.matrix_multiply(C_nn, psit_nG, P_ani, out_nG=kpt.psit_nG)
    self.timer.stop('rotate_psi')

    self.timer.stop('Orthonormalize')
def integrate(self, a_xg, b_yg=None,
              global_integral=True, hermitian=False,
              _transposed_result=None):
    """Integrate function(s) over domain.

    Parameters
    ----------
    a_xg: ndarray
        Function(s) to be integrated.
    b_yg: ndarray
        If present, integrate a_xg.conj() * b_yg.
    global_integral: bool
        If the array(s) are distributed over several domains, then the
        total sum will be returned.  To get the local contribution
        only, use global_integral=False.
    hermitian: bool
        Result is hermitian.
    _transposed_result: ndarray
        Long story.  Don't use this unless you are a method of the
        MatrixOperator class ...  When given, the result is written
        into it and no global sum is performed.

    Returns
    -------
    A Python scalar when the result has no dimensions left in the
    two-array case, otherwise an array shaped by the leading (non-grid)
    axes of the inputs.
    """
    xshape = a_xg.shape[:-3]

    if b_yg is None:
        # Only one array: sum over the three grid axes.
        result = a_xg.reshape(xshape + (-1,)).sum(axis=-1) * self.dv
        if global_integral:
            if result.ndim == 0:
                result = self.comm.sum(result)
            else:
                # Arrays are summed in place by the communicator.
                self.comm.sum(result)
        return result

    offloaded = isinstance(a_xg, mic.OffloadArray)
    if offloaded:
        # offload arrays have to be contiguous in any case
        A_xg, B_yg = a_xg, b_yg
    else:
        A_xg = np.ascontiguousarray(a_xg.reshape((-1,) + a_xg.shape[-3:]))
        B_yg = np.ascontiguousarray(b_yg.reshape((-1,) + b_yg.shape[-3:]))

    if _transposed_result is None:
        result_yx = np.zeros((len(B_yg), len(A_xg)), A_xg.dtype)
    else:
        result_yx = _transposed_result
        global_integral = False

    same_array = a_xg is b_yg
    if offloaded:
        result_yx_mic = stream.bind(result_yx)
        stream.sync()
        if same_array:
            # dsyrk performs badly in MIC so use dgemm here
            mic_gemm(self.dv, A_xg, A_xg, 0.0, result_yx_mic, 'c')
        elif hermitian:
            # NOTE(review): the host path below scales by 0.5 * dv while
            # this path passes dv -- confirm mic_r2k accounts for that.
            mic_r2k(self.dv, A_xg, B_yg, 0.0, result_yx_mic)
        else:
            mic_gemm(self.dv, A_xg, B_yg, 0.0, result_yx_mic, 'c')
        result_yx_mic.update_host()
        stream.sync()
    elif same_array:
        rk(self.dv, A_xg, 0.0, result_yx)
    elif hermitian:
        r2k(0.5 * self.dv, A_xg, B_yg, 0.0, result_yx)
    else:
        gemm(self.dv, A_xg, B_yg, 0.0, result_yx, 'c')

    if global_integral:
        self.comm.sum(result_yx)

    yshape = b_yg.shape[:-3]
    result = result_yx.T.reshape(xshape + yshape)
    return result.item() if result.ndim == 0 else result