def calculate_density_matrix(self, f_n, C_nM, rho_MM=None):
    """Calculate the density matrix through ``self.ksl``.

    Thin timed wrapper: all arguments are forwarded unchanged to
    ``self.ksl.calculate_density_matrix`` and the call is bracketed by
    the 'Calculate density matrix' timer.

    f_n:
        Forwarded as is (presumably occupation numbers -- confirm
        against the ksl implementation).
    C_nM:
        Forwarded as is (presumably expansion coefficients -- confirm).
    rho_MM:
        Optional array forwarded as third argument; ``None`` is passed
        through untouched.

    Returns whatever ``self.ksl.calculate_density_matrix`` returns.
    """
    # NOTE: ATLAS can't handle an uninitialized output array
    # (a former debugging aid here was ``rho_MM.fill(42)``).
    self.timer.start('Calculate density matrix')
    rho_MM = self.ksl.calculate_density_matrix(f_n, C_nM, rho_MM)
    self.timer.stop('Calculate density matrix')
    return rho_MM
def _pseudo_braket(self, bra_xG, ket_yG, A_yx, square=None):
    """Fill *A_yx* with braket matrix elements of pseudo wave functions.

    Low-level helper function.  Nothing is returned; the result is
    written into the caller-supplied *A_yx* array::

                   /     ~ *     ~
           A    =  | dG bra (G) ket  (G)
            nn'    /      n       n'

    Parameters:

    bra_xG: ndarray
        Set of bra-like vectors in which the matrix elements are
        evaluated.
    ket_yG: ndarray
        Set of ket-like vectors in which the matrix elements are
        evaluated.
    A_yx: ndarray
        Matrix in which to put calculated elements. Take care: Due to
        the difference in Fortran/C array order and the inherent BLAS
        nature, the matrix has to be filled in transposed (conjugated
        in future?).
    square: bool
        Whether there are as many bras as kets.  When None it is
        derived from the leading dimensions of the two inputs.
    """
    assert bra_xG.shape[1:] == ket_yG.shape[1:]
    assert (ket_yG.shape[0], bra_xG.shape[0]) == A_yx.shape

    if square is None:
        nbra = bra_xG.shape[0]
        nket = ket_yG.shape[0]
        square = (nbra == nket)

    dv = self.gd.dv

    # Same object on both sides: a rank-k update is sufficient.
    if ket_yG is bra_xG:
        rk(dv, bra_xG, 0.0, A_yx)
        return

    # Hermitian operator on a square block: rank-2k update.
    if self.hermitian and square:
        r2k(0.5 * dv, bra_xG, ket_yG, 0.0, A_yx)
        return

    # General case.
    gemm(dv, bra_xG, ket_yG, 0.0, A_yx, 'c')
def integrate(self, a_xg, b_yg=None,
              global_integral=True, hermitian=False,
              _transposed_result=None):
    """Integrate function(s) over domain.

    a_xg: ndarray
        Function(s) to be integrated.  The last three axes are treated
        as the grid axes.
    b_yg: ndarray
        If present, integrate a_xg.conj() * b_yg.
    global_integral: bool
        If the array(s) are distributed over several domains, then the
        total sum will be returned.  To get the local contribution
        only, use global_integral=False.
    hermitian: bool
        Result is hermitian.
    _transposed_result: ndarray
        Long story.  Don't use this unless you are a method of the
        MatrixOperator class ...  When given, the result is written
        into it and no sum over ``self.comm`` is done here.
    """
    xshape = a_xg.shape[:-3]

    if b_yg is None:
        # Only one array: flatten the grid axes and sum them.
        local = a_xg.reshape(xshape + (-1,)).sum(axis=-1) * self.dv
        if not global_integral:
            return local
        if local.ndim == 0:
            # 0-d result: use the value handed back by comm.sum.
            return self.comm.sum(local)
        # Array result: the return value of comm.sum is not used.
        self.comm.sum(local)
        return local

    A_xg = np.ascontiguousarray(a_xg.reshape((-1,) + a_xg.shape[-3:]))
    B_yg = np.ascontiguousarray(b_yg.reshape((-1,) + b_yg.shape[-3:]))

    if _transposed_result is not None:
        result_yx = _transposed_result
        global_integral = False
    else:
        result_yx = np.zeros((len(B_yg), len(A_xg)), A_xg.dtype)

    if a_xg is b_yg:
        rk(self.dv, A_xg, 0.0, result_yx)
    elif hermitian:
        r2k(0.5 * self.dv, A_xg, B_yg, 0.0, result_yx)
    else:
        gemm(self.dv, A_xg, B_yg, 0.0, result_yx, 'c')

    if global_integral:
        self.comm.sum(result_yx)

    result = result_yx.T.reshape(xshape + b_yg.shape[:-3])
    return result.item() if result.ndim == 0 else result
def get_vxc(paw, spin=0, U=None):
    """Calculate matrix elements of the xc-potential.

    paw:
        Calculator object; its hamiltonian, density and wfs are used.
    spin: int
        Index used to select the k-point entry, pseudo density and
        atomic correction.
    U: ndarray
        Optional rotation; when given, ``U^dagger Vxc U`` is returned.

    The result is multiplied by ``Hartree`` before it is returned.
    """
    assert not paw.hamiltonian.xc.xcfunc.orbital_dependent, "LDA/GGA's only"
    assert paw.wfs.dtype == float, 'Complex waves not implemented'

    if U is not None:
        # Rotate xc matrix
        U_dagger = U.T.conj()
        return np.dot(U_dagger, np.dot(get_vxc(paw, spin), U))

    hamiltonian = paw.hamiltonian
    gd = hamiltonian.gd
    kpt = paw.wfs.kpt_u[spin]
    psit_nG = kpt.psit_nG[:]

    density = paw.density
    if density.nt_sg is None:
        density.interpolate_pseudo_density()
    nt_g = density.nt_sg[spin]

    # xc potential on the fine grid, then restricted to the coarse grid.
    vxct_g = density.finegd.zeros()
    hamiltonian.xc.get_energy_and_potential(nt_g, vxct_g)
    vxct_G = gd.empty()
    hamiltonian.restrict(vxct_g, vxct_G)

    nbands = paw.wfs.bd.nbands
    Vxc_nn = np.zeros((nbands, nbands))

    # Apply pseudo part
    r2k(.5 * gd.dv, psit_nG, vxct_G * psit_nG, .0, Vxc_nn)  # lower triangle
    tri2full(Vxc_nn, 'L')  # Fill in upper triangle from lower
    gd.comm.sum(Vxc_nn)

    # Add atomic PAW corrections
    for a, P_ni in kpt.P_ani.items():
        D_sp = density.D_asp[a][:]
        H_sp = np.zeros_like(D_sp)
        xc_correction = paw.wfs.setups[a].xc_correction
        xc_correction.calculate_energy_and_derivatives(D_sp, H_sp)
        H_ii = unpack(H_sp[spin])
        Vxc_nn += np.dot(P_ni, np.dot(H_ii, P_ni.T))

    return Vxc_nn * Hartree
def _pseudo_braket(self, bra_xG, ket_yG, A_yx, square=None):
    """Fill *A_yx* with braket matrix elements of pseudo wave functions.

    Low-level helper function.  Nothing is returned; the result is
    written into the caller-supplied *A_yx* array::

                   /     ~ *     ~
           A    =  | dG bra (G) ket  (G)
            nn'    /      n       n'

    Parameters:

    bra_xG: ndarray
        Set of bra-like vectors in which the matrix elements are
        evaluated.
    ket_yG: ndarray
        Set of ket-like vectors in which the matrix elements are
        evaluated.
    A_yx: ndarray
        Matrix in which to put calculated elements. Take care: Due to
        the difference in Fortran/C array order and the inherent BLAS
        nature, the matrix has to be filled in transposed (conjugated
        in future?).
    square: bool
        Whether there are as many bras as kets.  When None it is
        derived from the leading dimensions of the two inputs.
    """
    assert bra_xG.shape[1:] == ket_yG.shape[1:]
    assert (ket_yG.shape[0], bra_xG.shape[0]) == A_yx.shape

    if square is None:
        nbra = bra_xG.shape[0]
        nket = ket_yG.shape[0]
        square = (nbra == nket)

    dv = self.gd.dv

    # Same object on both sides: a rank-k update is sufficient.
    if ket_yG is bra_xG:
        rk(dv, bra_xG, 0.0, A_yx)
        return

    # Hermitian operator on a square block: rank-2k update.
    if self.hermitian and square:
        r2k(0.5 * dv, bra_xG, ket_yG, 0.0, A_yx)
        return

    # General case.
    gemm(dv, bra_xG, ket_yG, 0.0, A_yx, 'c')
def alternative_calculate_density_matrix(self, f_n, C_nM, rho_MM=None):
    """Build the density matrix with a single rank-2k update.

    f_n:
        Factors multiplied onto the transposed coefficients.
    C_nM: ndarray
        Coefficient array; a transposed copy is what gets used.
    rho_MM: ndarray
        Optional output array.  When None, a zero array of shape
        ``(self.mynao, self.nao)`` with the dtype of *C_nM* is created.

    Returns *rho_MM* after ``r2k`` and ``tri2full`` have been applied.
    """
    out_MM = rho_MM
    if out_MM is None:
        out_MM = np.zeros((self.mynao, self.nao), dtype=C_nM.dtype)

    # Alternative suggestion. Might be faster. Someone should test this
    coef_Mn = C_nM.T.copy()
    weighted_Mn = f_n * coef_Mn
    r2k(0.5, coef_Mn, weighted_Mn, 0.0, out_MM)
    tri2full(out_MM)
    return out_MM
def integrate(self, a_xg, b_yg=None,
              global_integral=True, hermitian=False,
              _transposed_result=None):
    """Integrate function(s) over domain.

    a_xg: ndarray
        Function(s) to be integrated.  The last three axes are treated
        as the grid axes.
    b_yg: ndarray
        If present, integrate a_xg.conj() * b_yg.
    global_integral: bool
        If the array(s) are distributed over several domains, then the
        total sum will be returned.  To get the local contribution
        only, use global_integral=False.
    hermitian: bool
        Result is hermitian.
    _transposed_result: ndarray
        Long story.  Don't use this unless you are a method of the
        MatrixOperator class ...  When given, the result is written
        into it and no sum over ``self.comm`` is done here.
    """
    xshape = a_xg.shape[:-3]

    if b_yg is None:
        # Only one array: flatten the grid axes and sum them.
        local = a_xg.reshape(xshape + (-1,)).sum(axis=-1) * self.dv
        if not global_integral:
            return local
        if local.ndim == 0:
            # 0-d result: use the value handed back by comm.sum.
            return self.comm.sum(local)
        # Array result: the return value of comm.sum is not used.
        self.comm.sum(local)
        return local

    A_xg = np.ascontiguousarray(a_xg.reshape((-1,) + a_xg.shape[-3:]))
    B_yg = np.ascontiguousarray(b_yg.reshape((-1,) + b_yg.shape[-3:]))

    if _transposed_result is not None:
        result_yx = _transposed_result
        global_integral = False
    else:
        result_yx = np.zeros((len(B_yg), len(A_xg)), A_xg.dtype)

    if a_xg is b_yg:
        rk(self.dv, A_xg, 0.0, result_yx)
    elif hermitian:
        r2k(0.5 * self.dv, A_xg, B_yg, 0.0, result_yx)
    else:
        gemm(self.dv, A_xg, B_yg, 0.0, result_yx, 'c')

    if global_integral:
        self.comm.sum(result_yx)

    result = result_yx.T.reshape(xshape + b_yg.shape[:-3])
    return result.item() if result.ndim == 0 else result
def multiply(self, alpha, a, opa, b, opb, beta, c, symmetric):
    """Dispatch a matrix product to the matching ``blas`` routine.

    alpha, beta:
        Scalar factors handed to the BLAS call.
    a, b, c:
        Matrix objects; their ``.array`` attributes are what is passed
        on.  *c* receives the result.
    opa, opb:
        Operation flags.  In the symmetric case *opa* must be 'N' and
        *opb* must be 'C' (or 'T' when ``a.dtype`` is float).
    symmetric: bool
        When true a rank-k (``a is b``) or rank-2k update is used,
        otherwise the general ``blas.mmm``.

    Always returns None.
    """
    if not symmetric:
        blas.mmm(alpha, a.array, opa, b.array, opb, beta, c.array)
        return

    assert opa == 'N'
    assert opb == 'C' or (opb == 'T' and a.dtype == float)

    if a is b:
        blas.rk(alpha, a.array, beta, c.array)
    elif not (beta == 1.0 and a.shape[1] == 0):
        # The r2k call is skipped when beta is one and *a* has no columns.
        blas.r2k(0.5 * alpha, a.array, b.array, beta, c.array)
def soft_pseudo(self, paw, H_nn, h_nn=None, u=0):
    """Accumulate pair-potential contributions into *H_nn* and *h_nn*.

    Loops over all band pairs ``n1 <= n2`` of ``paw.wfs.kpt_u[u]``.  For
    each pair where at least one scaled occupation reaches ``fmin`` a
    pair density is set up, ``self.poisson_solve`` is called for it and
    the restricted potential ``self.vt_G`` is multiplied onto the
    partner wave function.

    paw:
        Calculator object providing ``wfs`` and ``density``.
    H_nn: ndarray
        Updated in place by the final ``r2k`` call (beta = 1.0) and then
        completed from its lower triangle by ``tri2full``.
    h_nn: ndarray
        Receives the projector terms; defaults to *H_nn* itself.
    u: int
        Index into ``paw.wfs.kpt_u``.

    Nothing is returned.
    """
    if h_nn is None:
        h_nn = H_nn
    kpt = paw.wfs.kpt_u[u]
    pd = self.pair_density
    # NOTE(review): with integer nspins this is integer division under
    # Python 2 and true division under Python 3 -- confirm intent.
    deg = 2 / self.nspins
    # Pairs where both scaled occupations are below fmin are skipped.
    fmin = 1e-9
    Htpsit_nG = np.zeros(kpt.psit_nG.shape, self.dtype)

    for n1 in range(self.nbands):
        psit1_G = kpt.psit_nG[n1]
        f1 = kpt.f_n[n1] / deg
        for n2 in range(n1, self.nbands):
            psit2_G = kpt.psit_nG[n2]
            f2 = kpt.f_n[n2] / deg
            if f1 < fmin and f2 < fmin:
                continue

            # Pair density for (n1, n2) including compensation charges.
            pd.initialize(kpt, n1, n2)
            pd.get_coarse(self.nt_G)
            pd.add_compensation_charges(self.nt_G, self.rhot_g)
            # charge is -1.0 for the diagonal pair and -0.0 otherwise.
            self.poisson_solve(self.vt_g, -self.rhot_g,
                               charge=-float(n1 == n2),
                               eps=1e-12,
                               zero_initial_phi=True)
            self.restrict(self.vt_g, self.vt_G)
            Htpsit_nG[n1] += f2 * self.vt_G * psit2_G
            # Off-diagonal pairs also contribute with the roles swapped.
            if n1 != n2:
                Htpsit_nG[n2] += f1 * self.vt_G * psit1_G

            # Projector part, accumulated into h_nn.
            v_aL = paw.density.ghat.dict()
            paw.density.ghat.integrate(self.vt_g, v_aL)
            for a, v_L in v_aL.items():
                v_ii = unpack(np.dot(paw.wfs.setups[a].Delta_pL, v_L))
                P_ni = kpt.P_ani[a]
                h_nn[:, n1] += f2 * np.dot(P_ni, np.dot(v_ii, P_ni[n2]))
                if n1 != n2:
                    h_nn[:, n2] += f1 * np.dot(P_ni,
                                               np.dot(v_ii, P_ni[n1]))

    symmetrize(h_nn)  # Grrrr why!!! XXX

    # Fill in lower triangle
    r2k(0.5 * self.dv, kpt.psit_nG[:], Htpsit_nG, 1.0, H_nn)

    # Fill in upper triangle from lower
    tri2full(H_nn, 'L')
def get_xc2(calc, w_wG, P_awi, spin=0):
    """Return xc-potential matrix elements between the functions *w_wG*.

    calc:
        Calculator object providing ``density``, ``hamiltonian`` and
        ``wfs``.
    w_wG: ndarray
        Functions on the coarse grid; ``len(w_wG)`` sets the matrix size.
    P_awi: dict
        Per-atom arrays used for the atomic PAW corrections.
    spin: int
        Index selecting the pseudo density and the atomic correction.

    The matrix is multiplied by ``Hartree`` before it is returned.
    """
    density = calc.density
    if density.nt_sg is None:
        density.interpolate()
    nt_g = density.nt_sg[spin]

    # xc potential on the fine grid, then restricted to the coarse grid.
    vxct_g = density.finegd.zeros()
    calc.hamiltonian.xc.get_energy_and_potential(nt_g, vxct_g)
    vxct_G = calc.wfs.gd.empty()
    calc.hamiltonian.restrict(vxct_g, vxct_G)

    # Integrate pseudo part
    nfunc = len(w_wG)
    xc_ww = np.empty((nfunc, nfunc))
    r2k(0.5 * calc.wfs.gd.dv, w_wG, vxct_G * w_wG, 0.0, xc_ww)
    tri2full(xc_ww, "L")

    # Add atomic PAW corrections
    for a, P_wi in P_awi.items():
        D_sp = calc.density.D_asp[a][:]
        H_sp = np.zeros_like(D_sp)
        xc_correction = calc.wfs.setups[a].xc_correction
        xc_correction.calculate_energy_and_derivatives(D_sp, H_sp)
        xc_ww += dots(P_wi, unpack(H_sp[spin]), P_wi.T.conj())

    return xc_ww * Hartree
def get_xc2(calc, w_wG, P_awi, spin=0):
    """Return xc-potential matrix elements between the functions *w_wG*.

    calc:
        Calculator object providing ``density``, ``hamiltonian`` and
        ``wfs``.
    w_wG: ndarray
        Functions on the coarse grid; ``len(w_wG)`` sets the matrix size.
    P_awi: dict
        Per-atom arrays used for the atomic PAW corrections.
    spin: int
        Index selecting the pseudo density and the atomic correction.

    The matrix is multiplied by ``Hartree`` before it is returned.
    """
    density = calc.density
    if density.nt_sg is None:
        density.interpolate_pseudo_density()
    nt_g = density.nt_sg[spin]

    # xc potential on the fine grid, then restricted (and collected).
    vxct_g = density.finegd.zeros()
    calc.hamiltonian.xc.get_energy_and_potential(nt_g, vxct_g)
    vxct_G = calc.wfs.gd.empty()
    calc.hamiltonian.restrict_and_collect(vxct_g, vxct_G)

    # Integrate pseudo part
    nfunc = len(w_wG)
    xc_ww = np.empty((nfunc, nfunc))
    r2k(.5 * calc.wfs.gd.dv, w_wG, vxct_G * w_wG, .0, xc_ww)
    tri2full(xc_ww, 'L')

    # Add atomic PAW corrections
    for a, P_wi in P_awi.items():
        D_sp = calc.density.D_asp[a][:]
        H_sp = np.zeros_like(D_sp)
        xc_correction = calc.wfs.setups[a].xc_correction
        xc_correction.calculate_energy_and_derivatives(D_sp, H_sp)
        xc_ww += dots(P_wi, unpack(H_sp[spin]), P_wi.T.conj())

    return xc_ww * Hartree
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Compare the simple PBLAS wrappers with serial BLAS references.

    Random matrices are created on the master, reference results are
    computed there with gemm/gemv/r2k/rk, the inputs are redistributed
    to block-cyclic layouts, the pblas_simple_* routines are run, and
    the results are collected and compared on rank 0.

    M, N, K: int
        Matrix dimensions: A and D are M x K, B is K x N, C is M x N,
        X has K rows and Y has M rows.
    seed: int
        Seed for the ``np.random.RandomState`` generator.
    mprocs, nprocs: int
        Passed to ``BlacsGrid`` together with ``world``.
    dtype:
        ``float`` or ``complex``; for complex a random imaginary part
        is added to every input matrix.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)

    if dtype == complex:
        epsilon = 1.0j
    else:
        epsilon = 0.0

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)

    # print(globA.asarray())
    # Populate matrices local to master:
    A0 = gen.rand(*globA.shape) + epsilon * gen.rand(*globA.shape)
    B0 = gen.rand(*globB.shape) + epsilon * gen.rand(*globB.shape)
    D0 = gen.rand(*globD.shape) + epsilon * gen.rand(*globD.shape)
    X0 = gen.rand(*globX.shape) + epsilon * gen.rand(*globX.shape)

    # Local result matrices
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U0 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates

    # Local reference matrix product:
    if rank == 0:
        # Reference for C0[:] = np.dot(A0, B0); note the swapped operand
        # order in this gemm call.
        gemm(1.0, B0, A0, 0.0, C0)
        # gemm(1.0, A0, A0, 0.0, Z0, transa='t')
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Reference for Y0[:] = np.dot(A0, X0)
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)

    assert globA.check(A0) and globB.check(B0) and globC.check(C0)
    assert globX.check(X0) and globY.check(Y0)
    assert globD.check(D0) and globS.check(S0) and globU.check(U0)

    # Create distributed descriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)

    # Distributed matrices:
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U = distU.zeros(dtype=dtype)  # zeros needed for rank-updates

    Redistributor(world, globA, distA).redistribute(A0, A)
    Redistributor(world, globB, distB).redistribute(B0, B)
    Redistributor(world, globX, distX).redistribute(X0, X)
    Redistributor(world, globD, distD).redistribute(D0, D)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa='T')
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)

    # Collect result back on master
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U1 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    Redistributor(world, distC, globC).redistribute(C, C1)
    Redistributor(world, distY, globY).redistribute(Y, Y1)
    Redistributor(world, distS, globS).redistribute(S, S1)
    Redistributor(world, distU, globU).redistribute(U, U1)

    if rank == 0:
        gemm_err = abs(C1 - C0).max()
        gemv_err = abs(Y1 - Y0).max()
        r2k_err = abs(S1 - S0).max()
        rk_err = abs(U1 - U0).max()
        print('gemm err', gemm_err)
        print('gemv err', gemv_err)
        print('r2k err', r2k_err)
        print('rk_err', rk_err)
    else:
        gemm_err = 0.0
        gemv_err = 0.0
        r2k_err = 0.0
        rk_err = 0.0

    gemm_err = world.sum(gemm_err)  # We don't like exceptions on only one cpu
    gemv_err = world.sum(gemv_err)
    r2k_err = world.sum(r2k_err)
    rk_err = world.sum(rk_err)

    equal(gemm_err, 0, tol)
    equal(gemv_err, 0, tol)
    equal(r2k_err, 0, tol)
    equal(rk_err, 0, tol)
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Compare the simple PBLAS wrappers with serial references.

    Random matrices are created on the master, reference results are
    computed there, the inputs are redistributed to block-cyclic
    layouts, the pblas_simple_* routines (gemm, gemv, r2k, rk, hemm)
    are run, and the collected results are compared on rank 0.

    M, N, K: int
        Matrix dimensions: A and D are M x K, B is K x N, C is M x N,
        X has K rows, Y has M rows and the hermitian matrix is K x K.
    seed: int
        Seed for the ``np.random.RandomState`` generator.
    mprocs, nprocs: int
        Passed to ``BlacsGrid`` together with ``world``.
    dtype:
        ``float`` or ``complex``; for complex a random imaginary part
        is added to every input matrix.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)
    epsilon = 1.0j if dtype == complex else 0.0

    def random_matrix(desc):
        # Two draws per matrix (real part first) so that the sequence
        # of random numbers consumed from ``gen`` stays the same.
        real_part = gen.rand(*desc.shape)
        imag_part = gen.rand(*desc.shape)
        return real_part + epsilon * imag_part

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)
    globHEC = grid.new_descriptor(K, K, K, K)

    # Populate matrices local to master:
    A0 = random_matrix(globA)
    B0 = random_matrix(globB)
    D0 = random_matrix(globD)
    X0 = random_matrix(globX)

    # HEC = HEA * B
    HEA0 = random_matrix(globHEC)
    if world.rank == 0:
        # Make H0 hermitean
        HEA0 = np.ascontiguousarray(HEA0 + HEA0.T.conjugate())

    # Local result matrices
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U0 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC0 = globB.zeros(dtype=dtype)

    # Local reference matrix product:
    if rank == 0:
        # Reference for C0 = A0 . B0; note the swapped operand order.
        gemm(1.0, B0, A0, 0.0, C0)
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Reference for Y0 = A0 . X0
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)

        HEC0[:] = np.dot(HEA0, B0)
        # We don't use upper diagonal: overwrite every element with
        # row index < column index by a sentinel value.
        sM, sN = HEA0.shape
        HEA0[np.triu_indices(sM, 1, sN)] = 99999.0
        if world.rank == 0:
            print(HEA0)

    for desc, matrix in ((globA, A0), (globB, B0), (globC, C0),
                         (globX, X0), (globY, Y0),
                         (globD, D0), (globS, S0), (globU, U0)):
        assert desc.check(matrix)

    # Create distributed descriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)
    distHE = grid.new_descriptor(K, K, 2, 4)

    # Distributed matrices:
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U = distU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC = distB.zeros(dtype=dtype)
    HEA = distHE.zeros(dtype=dtype)

    # Scatter the master copies into the distributed layouts.
    for src_desc, dst_desc, src, dst in ((globA, distA, A0, A),
                                         (globB, distB, B0, B),
                                         (globX, distX, X0, X),
                                         (globD, distD, D0, D),
                                         (globHEC, distHE, HEA0, HEA)):
        Redistributor(world, src_desc, dst_desc).redistribute(src, dst)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa='T')
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)
    pblas_simple_hemm(distHE, distB, distB, HEA, B, HEC,
                      uplo='L', side='L')

    # Collect result back on master
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U1 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC1 = globB.zeros(dtype=dtype)
    for src_desc, dst_desc, src, dst in ((distC, globC, C, C1),
                                         (distY, globY, Y, Y1),
                                         (distS, globS, S, S1),
                                         (distU, globU, U, U1),
                                         (distB, globB, HEC, HEC1)):
        Redistributor(world, src_desc, dst_desc).redistribute(src, dst)

    labels = ('gemm err', 'gemv err', 'r2k err', 'rk_err', 'hemm_err')
    if rank == 0:
        pairs = ((C1, C0), (Y1, Y0), (S1, S0), (U1, U0), (HEC1, HEC0))
        errors = [abs(got - ref).max() for got, ref in pairs]
        for label, err in zip(labels, errors):
            print(label, err)
    else:
        errors = [0.0] * len(labels)

    # We don't like exceptions on only one cpu
    errors = [world.sum(err) for err in errors]

    for err in errors:
        equal(err, 0, tol)
assert not c.any() # Check axpy c = 5.j * a axpy(-5.j, a, c) assert not c.any() # Check rk c = np.tensordot(a, a.conj(), [[1, 2, 3], [1, 2, 3]]) rk(1., a, -1., c) tri2full(c) assert not c.any() # Check gemmdot for transa='c' c = np.tensordot(a, a2.conj(), [-1, -1]) gemmdot(a, a2, beta=-1., out=c, trans='c') assert not c.any() # Check gemmdot for transa='n' a2.shape = 3, 7, 5, 1 c = np.tensordot(a, a2, [-1, 0]) gemmdot(a, a2, beta=-1., out=c, trans='n') assert not c.any() # Check r2k a2 = 5. * a c = np.tensordot(a, a2.conj(), [[1, 2, 3], [1, 2, 3]]) r2k(.5, a, a2, -1., c) tri2full(c) assert not c.any()
def integrate(self, a_xg, b_yg=None,
              global_integral=True, hermitian=False,
              _transposed_result=None):
    """Integrate function(s) over domain.

    a_xg: ndarray
        Function(s) to be integrated; the last axis is the one that is
        contracted.
    b_yg: ndarray
        If present, integrate a_xg.conj() * b_yg.
    global_integral: bool
        Documented as selecting between the total and the local
        contribution; this implementation accepts the argument but
        never reads it.
    hermitian: bool
        Result is hermitian.
    _transposed_result: ndarray
        Long story.  Don't use this unless you are a method of the
        MatrixOperator class ...
    """
    if b_yg is None:
        # Only one array:
        assert self.dtype == float
        first_coefficient = a_xg[..., 0]
        return first_coefficient.real * self.gd.dv

    # Flatten everything but the last axis.
    A_xg = a_xg.reshape((-1, a_xg.shape[-1]))
    B_yg = b_yg.reshape((-1, b_yg.shape[-1]))

    alpha = self.gd.dv / self.gd.N_c.prod()
    real_valued = self.dtype == float
    if real_valued:
        # Work on float views of the arrays with a doubled prefactor.
        alpha *= 2
        A_xg = A_xg.view(float)
        B_yg = B_yg.view(float)

    if _transposed_result is not None:
        result_yx = _transposed_result
    else:
        result_yx = np.zeros((len(B_yg), len(A_xg)), self.dtype)

    if a_xg is b_yg:
        rk(alpha, A_xg, 0.0, result_yx)
    elif hermitian:
        r2k(0.5 * alpha, A_xg, B_yg, 0.0, result_yx)
    else:
        gemm(alpha, A_xg, B_yg, 0.0, result_yx, 'c')

    if real_valued:
        # Subtract a term built from the first column of both views.
        correction_yx = np.outer(B_yg[:, 0], A_xg[:, 0])
        if hermitian:
            result_yx -= 0.25 * alpha * (correction_yx + correction_yx.T)
        else:
            result_yx -= 0.5 * alpha * correction_yx

    out_shape = a_xg.shape[:-1] + b_yg.shape[:-1]
    result = result_yx.T.reshape(out_shape)
    return result.item() if result.ndim == 0 else result
def integrate(self, a_xg, b_yg=None,
              global_integral=True, hermitian=False,
              _transposed_result=None):
    """Integrate function(s) over domain.

    a_xg: ndarray
        Function(s) to be integrated.  The last three axes are treated
        as the grid axes.  May be a ``mic.OffloadArray``, in which case
        the mic_* kernels are used for the two-array case.
    b_yg: ndarray
        If present, integrate a_xg.conj() * b_yg.
    global_integral: bool
        If the array(s) are distributed over several domains, then the
        total sum will be returned.  To get the local contribution
        only, use global_integral=False.
    hermitian: bool
        Result is hermitian.
    _transposed_result: ndarray
        Long story.  Don't use this unless you are a method of the
        MatrixOperator class ...  When given, the result is written
        into it and no sum over ``self.comm`` is done here.
    """
    xshape = a_xg.shape[:-3]

    if b_yg is None:
        # Only one array:
        total = a_xg.reshape(xshape + (-1,)).sum(axis=-1) * self.dv
        if global_integral:
            if total.ndim == 0:
                total = self.comm.sum(total)
            else:
                self.comm.sum(total)
        return total

    offloaded = isinstance(a_xg, mic.OffloadArray)

    if offloaded:
        # offload arrays have to be contiguous in any case
        A_xg, B_yg = a_xg, b_yg
    else:
        A_xg = np.ascontiguousarray(a_xg.reshape((-1,) + a_xg.shape[-3:]))
        B_yg = np.ascontiguousarray(b_yg.reshape((-1,) + b_yg.shape[-3:]))

    if _transposed_result is not None:
        result_yx = _transposed_result
        global_integral = False
    else:
        result_yx = np.zeros((len(B_yg), len(A_xg)), A_xg.dtype)

    same_arrays = a_xg is b_yg
    if offloaded:
        result_yx_mic = stream.bind(result_yx)
        stream.sync()
        if same_arrays:
            # dsyrk performs badly in MIC so use dgemm here
            mic_gemm(self.dv, A_xg, A_xg, 0.0, result_yx_mic, 'c')
        elif hermitian:
            # NOTE(review): the host branch passes 0.5 * self.dv to r2k
            # while self.dv is passed here -- confirm that mic_r2k uses
            # a different prefactor convention.
            mic_r2k(self.dv, A_xg, B_yg, 0.0, result_yx_mic)
        else:
            mic_gemm(self.dv, A_xg, B_yg, 0.0, result_yx_mic, 'c')
        result_yx_mic.update_host()
        stream.sync()
    elif same_arrays:
        rk(self.dv, A_xg, 0.0, result_yx)
    elif hermitian:
        r2k(0.5 * self.dv, A_xg, B_yg, 0.0, result_yx)
    else:
        gemm(self.dv, A_xg, B_yg, 0.0, result_yx, 'c')

    if global_integral:
        self.comm.sum(result_yx)

    result = result_yx.T.reshape(xshape + b_yg.shape[:-3])
    return result.item() if result.ndim == 0 else result
def main(M=160, N=120, K=140, seed=42, mprocs=2, nprocs=2, dtype=float):
    """Compare the simple PBLAS wrappers with serial references.

    Random matrices are created on the master, reference results are
    computed there, the inputs are redistributed to block-cyclic
    layouts, the pblas_simple_* routines (gemm, gemv, r2k, rk, hemm)
    are run, and the collected results are compared on rank 0.

    M, N, K: int
        Matrix dimensions: A and D are M x K, B is K x N, C is M x N,
        X has K rows, Y has M rows and the hermitian matrix is K x K.
    seed: int
        Seed for the ``np.random.RandomState`` generator.
    mprocs, nprocs: int
        Passed to ``BlacsGrid`` together with ``world``.
    dtype:
        ``float`` or ``complex``; for complex a random imaginary part
        is added to every input matrix.
    """
    gen = np.random.RandomState(seed)
    grid = BlacsGrid(world, mprocs, nprocs)
    epsilon = 1.0j if dtype == complex else 0.0

    def random_matrix(desc):
        # Two draws per matrix (real part first) so that the sequence
        # of random numbers consumed from ``gen`` stays the same.
        real_part = gen.rand(*desc.shape)
        imag_part = gen.rand(*desc.shape)
        return real_part + epsilon * imag_part

    # Create descriptors for matrices on master:
    globA = grid.new_descriptor(M, K, M, K)
    globB = grid.new_descriptor(K, N, K, N)
    globC = grid.new_descriptor(M, N, M, N)
    globZ = grid.new_descriptor(K, K, K, K)
    globX = grid.new_descriptor(K, 1, K, 1)
    globY = grid.new_descriptor(M, 1, M, 1)
    globD = grid.new_descriptor(M, K, M, K)
    globS = grid.new_descriptor(M, M, M, M)
    globU = grid.new_descriptor(M, M, M, M)
    globHEC = grid.new_descriptor(K, K, K, K)

    # Populate matrices local to master:
    A0 = random_matrix(globA)
    B0 = random_matrix(globB)
    D0 = random_matrix(globD)
    X0 = random_matrix(globX)

    # HEC = HEA * B
    HEA0 = random_matrix(globHEC)
    if world.rank == 0:
        # Make H0 hermitean
        HEA0 = HEA0 + HEA0.T.conjugate()

    # Local result matrices
    Y0 = globY.empty(dtype=dtype)
    C0 = globC.zeros(dtype=dtype)
    Z0 = globZ.zeros(dtype=dtype)
    S0 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U0 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC0 = globB.zeros(dtype=dtype)

    # Local reference matrix product:
    if rank == 0:
        # Reference for C0 = A0 . B0; note the swapped operand order.
        gemm(1.0, B0, A0, 0.0, C0)
        print(A0.shape, Z0.shape)
        Z0[:] = np.dot(A0.T, A0)
        # Reference for Y0 = A0 . X0
        gemv(1.0, A0, X0.ravel(), 0.0, Y0.ravel())
        r2k(1.0, A0, D0, 0.0, S0)
        rk(1.0, A0, 0.0, U0)

        HEC0[:] = np.dot(HEA0, B0)
        # We don't use upper diagonal: overwrite every element with
        # row index < column index by a sentinel value.
        sM, sN = HEA0.shape
        HEA0[np.triu_indices(sM, 1, sN)] = 99999.0
        if world.rank == 0:
            print(HEA0)

    for desc, matrix in ((globA, A0), (globB, B0), (globC, C0),
                         (globX, X0), (globY, Y0),
                         (globD, D0), (globS, S0), (globU, U0)):
        assert desc.check(matrix)

    # Create distributed descriptors with various block sizes:
    distA = grid.new_descriptor(M, K, 2, 2)
    distB = grid.new_descriptor(K, N, 2, 4)
    distC = grid.new_descriptor(M, N, 3, 2)
    distZ = grid.new_descriptor(K, K, 5, 7)
    distX = grid.new_descriptor(K, 1, 4, 1)
    distY = grid.new_descriptor(M, 1, 3, 1)
    distD = grid.new_descriptor(M, K, 2, 3)
    distS = grid.new_descriptor(M, M, 2, 2)
    distU = grid.new_descriptor(M, M, 2, 2)
    distHE = grid.new_descriptor(K, K, 2, 4)

    # Distributed matrices:
    A = distA.empty(dtype=dtype)
    B = distB.empty(dtype=dtype)
    C = distC.empty(dtype=dtype)
    Z = distZ.empty(dtype=dtype)
    X = distX.empty(dtype=dtype)
    Y = distY.empty(dtype=dtype)
    D = distD.empty(dtype=dtype)
    S = distS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U = distU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC = distB.zeros(dtype=dtype)
    HEA = distHE.zeros(dtype=dtype)

    # Scatter the master copies into the distributed layouts.
    for src_desc, dst_desc, src, dst in ((globA, distA, A0, A),
                                         (globB, distB, B0, B),
                                         (globX, distX, X0, X),
                                         (globD, distD, D0, D),
                                         (globHEC, distHE, HEA0, HEA)):
        Redistributor(world, src_desc, dst_desc).redistribute(src, dst)

    pblas_simple_gemm(distA, distB, distC, A, B, C)
    pblas_simple_gemm(distA, distA, distZ, A, A, Z, transa="T")
    pblas_simple_gemv(distA, distX, distY, A, X, Y)
    pblas_simple_r2k(distA, distD, distS, A, D, S)
    pblas_simple_rk(distA, distU, A, U)
    pblas_simple_hemm(distHE, distB, distB, HEA, B, HEC,
                      uplo="L", side="L")

    # Collect result back on master
    C1 = globC.empty(dtype=dtype)
    Y1 = globY.empty(dtype=dtype)
    S1 = globS.zeros(dtype=dtype)  # zeros needed for rank-updates
    U1 = globU.zeros(dtype=dtype)  # zeros needed for rank-updates
    HEC1 = globB.zeros(dtype=dtype)
    for src_desc, dst_desc, src, dst in ((distC, globC, C, C1),
                                         (distY, globY, Y, Y1),
                                         (distS, globS, S, S1),
                                         (distU, globU, U, U1),
                                         (distB, globB, HEC, HEC1)):
        Redistributor(world, src_desc, dst_desc).redistribute(src, dst)

    labels = ("gemm err", "gemv err", "r2k err", "rk_err", "hemm_err")
    if rank == 0:
        pairs = ((C1, C0), (Y1, Y0), (S1, S0), (U1, U0), (HEC1, HEC0))
        errors = [abs(got - ref).max() for got, ref in pairs]
        for label, err in zip(labels, errors):
            print(label, err)
    else:
        errors = [0.0] * len(labels)

    # We don't like exceptions on only one cpu
    errors = [world.sum(err) for err in errors]

    for err in errors:
        equal(err, 0, tol)
def iterate_one_k_point(self, hamiltonian, wfs, kpt):
    """Do Davidson iterations for the kpoint

    After an initial subspace diagonalization, each of the
    ``self.niter`` iterations builds the 2n x 2n matrices ``H_2n2n``
    and ``S_2n2n`` from the current wave functions and the
    preconditioned residuals, diagonalizes them on rank 0 of
    ``self.gd.comm``, and rotates ``kpt.psit_nG`` and ``kpt.P_ani``
    with the resulting coefficients.  ``kpt.eps_n`` is updated from
    the first ``nbands`` eigenvalues.

    Returns the weighted residual norm summed over ``self.gd.comm``.

    NOTE(review): ``error`` is reset at the start of every iteration,
    so the returned value stems from the last iteration only, and it
    is unbound when ``self.niter`` is 0 -- confirm this is intended.
    """
    niter = self.niter
    nbands = self.nbands

    self.subspace_diagonalize(hamiltonian, wfs, kpt)

    H_2n2n = self.H_2n2n
    S_2n2n = self.S_2n2n
    eps_2n = self.eps_2n
    psit2_nG = wfs.matrixoperator.suggest_temporary_buffer()

    self.timer.start('Davidson')
    # The residuals are stored in the Htpsit_nG work array.
    R_nG = self.Htpsit_nG
    self.calculate_residuals(kpt, wfs, hamiltonian, kpt.psit_nG,
                             kpt.P_ani, kpt.eps_n, R_nG)

    for nit in range(niter):
        H_2n2n[:] = 0.0
        S_2n2n[:] = 0.0

        error = 0.0
        for n in range(nbands):
            # Weight of band n in the error measure.
            if kpt.f_n is None:
                weight = kpt.weight
            else:
                weight = kpt.f_n[n]
            # Unless converging 'occupied' bands, only the first
            # nbands_converge bands count, each with the k-point weight.
            if self.nbands_converge != 'occupied':
                if n < self.nbands_converge:
                    weight = kpt.weight
                else:
                    weight = 0.0
            error += weight * np.vdot(R_nG[n], R_nG[n]).real

            # Diagonal of the upper-left blocks.
            H_2n2n[n,n] = kpt.eps_n[n]
            S_2n2n[n,n] = 1.0
            psit2_nG[n] = self.preconditioner(R_nG[n], kpt)

        # Calculate projections
        P2_ani = wfs.pt.dict(nbands)
        wfs.pt.integrate(psit2_nG, P2_ani, kpt.q)

        # Hamiltonian matrix
        # <psi2 | H | psi>
        wfs.kin.apply(psit2_nG, self.Htpsit_nG, kpt.phase_cd)
        hamiltonian.apply_local_potential(psit2_nG, self.Htpsit_nG, kpt.s)
        gemm(self.gd.dv, kpt.psit_nG, self.Htpsit_nG, 0.0, self.H_nn, 'c')

        for a, P_ni in kpt.P_ani.items():
            P2_ni = P2_ani[a]
            dH_ii = unpack(hamiltonian.dH_asp[a][kpt.s])
            self.H_nn += np.dot(P2_ni, np.dot(dH_ii, P_ni.T.conj()))

        self.gd.comm.sum(self.H_nn, 0)
        H_2n2n[nbands:, :nbands] = self.H_nn

        # <psi2 | H | psi2>
        r2k(0.5 * self.gd.dv, psit2_nG, self.Htpsit_nG, 0.0, self.H_nn)
        for a, P2_ni in P2_ani.items():
            dH_ii = unpack(hamiltonian.dH_asp[a][kpt.s])
            self.H_nn += np.dot(P2_ni, np.dot(dH_ii, P2_ni.T.conj()))

        self.gd.comm.sum(self.H_nn, 0)
        H_2n2n[nbands:, nbands:] = self.H_nn

        # Overlap matrix
        # <psi2 | S | psi>
        gemm(self.gd.dv, kpt.psit_nG, psit2_nG, 0.0, self.S_nn, "c")

        for a, P_ni in kpt.P_ani.items():
            P2_ni = P2_ani[a]
            dO_ii = wfs.setups[a].dO_ii
            self.S_nn += np.dot(P2_ni, np.inner(dO_ii, P_ni.conj()))

        self.gd.comm.sum(self.S_nn, 0)
        S_2n2n[nbands:, :nbands] = self.S_nn

        # <psi2 | S | psi2>
        rk(self.gd.dv, psit2_nG, 0.0, self.S_nn)
        for a, P2_ni in P2_ani.items():
            dO_ii = wfs.setups[a].dO_ii
            self.S_nn += np.dot(P2_ni, np.dot(dO_ii, P2_ni.T.conj()))

        self.gd.comm.sum(self.S_nn, 0)
        S_2n2n[nbands:, nbands:] = self.S_nn

        # Diagonalize on rank 0 only, then broadcast the results.
        if self.gd.comm.rank == 0:
            general_diagonalize(H_2n2n, eps_2n, S_2n2n)

        self.gd.comm.broadcast(H_2n2n, 0)
        self.gd.comm.broadcast(eps_2n, 0)

        kpt.eps_n[:] = eps_2n[:nbands]

        # Rotate psit_nG
        gemm(1.0, kpt.psit_nG, H_2n2n[:nbands, :nbands],
             0.0, self.Htpsit_nG)
        gemm(1.0, psit2_nG, H_2n2n[:nbands, nbands:],
             1.0, self.Htpsit_nG)
        # Swap buffers: the rotated functions become kpt.psit_nG.
        kpt.psit_nG, self.Htpsit_nG = self.Htpsit_nG, kpt.psit_nG

        # Rotate P_uni:
        for a, P_ni in kpt.P_ani.items():
            P2_ni = P2_ani[a]
            # A copy is used as input because P_ni is also the output.
            gemm(1.0, P_ni.copy(), H_2n2n[:nbands, :nbands], 0.0, P_ni)
            gemm(1.0, P2_ni, H_2n2n[:nbands, nbands:], 1.0, P_ni)

        # New residuals are only needed if another iteration follows.
        if nit < niter - 1 :
            wfs.kin.apply(kpt.psit_nG, self.Htpsit_nG, kpt.phase_cd)
            hamiltonian.apply_local_potential(kpt.psit_nG, self.Htpsit_nG,
                                              kpt.s)
            R_nG = self.Htpsit_nG
            self.calculate_residuals(kpt, wfs, hamiltonian, kpt.psit_nG,
                                     kpt.P_ani, kpt.eps_n, R_nG)

    self.timer.stop('Davidson')
    error = self.gd.comm.sum(error)
    return error