# Benchmark fragment: times repeated upload / integrate / gemm / download
# round trips on offloaded arrays and prints a checksum plus the mean time
# per round trip.  `gd`, `nbands`, `stream`, `c`, `repeats`, `rank` and
# `mic_gemm` are expected to be defined earlier in the script.
a_mic = gd.empty(nbands, usemic=True)
b_mic = gd.empty(nbands, usemic=True)
c_mic = stream.bind(c)

# Fixed seed so the printed checksum is reproducible between runs.
np.random.seed(10)
a_mic.array[:] = np.random.random(a_mic.shape)
b_mic.array[:] = np.random.random(b_mic.shape)


def _offload_round_trip():
    """Run one upload / integrate / gemm / download cycle."""
    a_mic.update_device()
    b_mic.update_device()
    # The integral is written into the host array `c` ...
    gd.integrate(a_mic, b_mic, hermitian=False, _transposed_result=c)
    # ... which is then pushed to the device through its bound handle.
    c_mic.update_device()
    mic_gemm(1.0, a_mic, c_mic, 0.0, b_mic)
    b_mic.update_host()


# Warm-up: three untimed round trips before the measurement starts.
for i in range(3):
    _offload_round_trip()
# NOTE(review): transfers queued on the stream are assumed to be
# asynchronous (pymic semantics) -- sync so the timestamps bracket
# completed work only.  Confirm against the pymic version in use.
stream.sync()

t0 = time()
# Reference check once used here: equal(np.sum(c), 3600.89536641, 1e-6)
for i in range(repeats):
    _offload_round_trip()
stream.sync()  # make sure the last update_host() has landed
t1 = time()

if rank == 0:
    # Single-expression print: same output on Python 2 and Python 3.
    print("Check %s Time %s" % (np.sum(b_mic.array), (t1 - t0) / repeats))
def matrix_multiply(self, C_NN, psit_nG, P_ani=None, out_nG=None):
    r"""Calculate new linear combinations of wave functions.

    Results will be put in the *P_ani* dict and a new psit_nG returned::

                 __                                __
        ~       \       ~           ~a  ~         \       ~a  ~
       psi  <--  ) C   psi    and  <p |psi >  <--  ) C   <p |psi >
          n     /__ nn'   n'         i    n       /__ nn'  i    n'
                 n'                                n'

    Parameters:

    C_NN: ndarray
        Matrix representation of the requested linear combinations. Even
        with a hermitian operator, this matrix need not be self-adjoint.
        However, unlike the results from calculate_matrix_elements, it is
        assumed that all matrix elements are filled in (use e.g. tri2full).
    psit_nG: ndarray
        Set of vectors in which the matrix elements are evaluated.
    P_ani: dict
        Dictionary of projector overlap integrals P_ni = <p_i | psit_nG>.
        Its arrays are overwritten with the transformed values.
    out_nG: ndarray
        Optional output array.  It is only consulted in the simple case
        (one band group and one block); the parallel/blocked branch never
        reads it and writes the result into psit_nG instead.

    Returns out_nG (or the internal work array when out_nG is None) in the
    simple case, otherwise psit_nG restored to its original shape.
    """
    # Work arrays are created on first use.
    if self.A_nn is None:
        self.allocate_arrays()

    band_comm = self.bd.comm
    B = band_comm.size      # size of the band communicator
    J = self.nblocks        # requested number of grid slices
    N = self.bd.mynbands    # local band count (rows of the reshaped psit_nG)

    C_NN = self.bmd.redistribute_input(C_NN)

    if B == 1 and J == 1:
        # Simple case:
        if use_mic:
            work_nG = self.work1_xG_mic
        else:
            work_nG = reshape(self.work1_xG, psit_nG.shape)
        if out_nG is None:
            # No output given: the result lands in (and is returned as)
            # the work array.
            out_nG = work_nG
            # out_nG[:] = 117  # gemm may not like nan's
        elif out_nG is psit_nG:
            # In-place request: copy the input into the work array so gemm
            # does not read from the array it is writing to.
            work_nG[:] = psit_nG
            psit_nG = work_nG

        if use_mic:
            # NOTE(review): offload_report(1)/(0) presumably switches
            # offload reporting on and off around the MIC call -- confirm.
            if self.gd.comm.rank == 0:
                offload_report(1)
            # Stage the coefficient matrix in the offload buffer and push
            # it to the device before launching the offloaded gemm.
            C_NN_mic = self.A_nn_mic
            C_NN_mic.array[:] = C_NN[:]
            C_NN_mic.update_device()
            stream.sync()
            mic_gemm(1.0, psit_nG, C_NN_mic, 0.0, out_nG)
            if self.gd.comm.rank == 0:
                offload_report(0)
        else:
            self.gd.gemm(1.0, psit_nG, C_NN, 0.0, out_nG)
        if P_ani:
            # Transform each projector overlap array in place; a copy is
            # used as gemm input because P_ni is also the output.
            for P_ni in P_ani.values():
                gemm(1.0, P_ni.copy(), C_NN, 0.0, P_ni)
        return out_nG

    # Now it gets nasty! We parallelize over B groups of bands and
    # each grid chunk is divided in J smaller slices (less memory).

    Q = B  # always non-hermitian XXX
    rank = band_comm.rank
    shape = psit_nG.shape   # remembered so it can be restored on return
    psit_nG = psit_nG.reshape(N, -1)
    G = psit_nG.shape[1]  # number of grid-points
    g = int(np.ceil(G / float(J)))  # nominal slice width (rounded up)

    # Buffers for send/receive of pre-multiplication versions of P_ani's.
    sbuf_nI = rbuf_nI = None
    if P_ani:
        # All P_ni arrays are packed side by side into one contiguous
        # buffer; the receive buffer is only needed with more than one
        # band group.
        sbuf_nI = np.hstack([P_ni for P_ni in P_ani.values()])
        sbuf_nI = np.ascontiguousarray(sbuf_nI)
        if B > 1:
            rbuf_nI = np.empty_like(sbuf_nI)

    # Because of the amount of communication involved, we need to
    # be synchronized up to this point but only on the 1D band_comm
    # communication ring
    band_comm.barrier()
    # Rounding g up can leave trailing slices that would start at or
    # beyond G; drop them.
    while g * J >= G + g:  # remove extra slice(s)
        J -= 1
    assert 0 < g * J < G + g

    work1_xG = reshape(self.work1_xG, (self.X, ) + psit_nG.shape[1:])
    work2_xG = reshape(self.work2_xG, (self.X, ) + psit_nG.shape[1:])

    for j in range(J):
        # Grid-point range [G1, G2) of this slice; the slice that reaches
        # the end of the grid is clipped and g shrinks accordingly.
        G1 = j * g
        G2 = G1 + g
        if G2 > G:
            G2 = G
            g = G2 - G1
        sbuf_ng = reshape(work1_xG, (N, g))
        rbuf_ng = reshape(work2_xG, (N, g))
        sbuf_ng[:] = psit_nG[:, G1:G2]
        # beta = 0 makes the first gemm overwrite the output slice; later
        # steps accumulate with beta = 1.
        beta = 0.0
        # Truthy only on the last slice and only when P_ani is non-empty.
        cycle_P_ani = (j == J - 1 and P_ani)
        for q in range(Q):
            # Start sending currently buffered kets to rank below
            # and receiving next set of kets from rank above us.
            # If we're at the last slice, start cycling P_ani too.
            if q < Q - 1:
                self._initialize_cycle(sbuf_ng, rbuf_ng,
                                       sbuf_nI, rbuf_nI, cycle_P_ani)

            # Calculate wave-function contributions from the current slice
            # of grid data by the current mynbands x mynbands matrix block.
            C_nn = self.bmd.extract_block(C_NN, (rank + q) % B, rank)
            self.gd.gemm(1.0, sbuf_ng, C_nn, beta, psit_nG[:, G1:G2])

            # If we're at the last slice, add contributions to P_ani's.
            if cycle_P_ani:
                # Walk the packed buffer column range [I1, I2) belonging
                # to each P_ni in turn.
                I1 = 0
                for P_ni in P_ani.values():
                    I2 = I1 + P_ni.shape[1]
                    gemm(1.0, sbuf_nI[:, I1:I2], C_nn, beta, P_ni)
                    I1 = I2

            # Wait for all send/receives to finish before next iteration.
            # Swap send and receive buffer such that next becomes current.
            # If we're at the last slice, also finishes the P_ani cycle.
            if q < Q - 1:
                sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI = self._finish_cycle(
                    sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI, cycle_P_ani)

            # First iteration was special because we initialized the kets
            if q == 0:
                beta = 1.0

    # NOTE(review): out_nG is not used on this path; the result is
    # returned in psit_nG (the caller's buffer if the reshape above
    # produced a view).
    psit_nG.shape = shape
    return psit_nG
def matrix_multiply(self, C_NN, psit_nG, P_ani=None, out_nG=None):
    r"""Calculate new linear combinations of wave functions.

    Results will be put in the *P_ani* dict and a new psit_nG returned::

                 __                                __
        ~       \       ~           ~a  ~         \       ~a  ~
       psi  <--  ) C   psi    and  <p |psi >  <--  ) C   <p |psi >
          n     /__ nn'   n'         i    n       /__ nn'  i    n'
                 n'                                n'

    Parameters:

    C_NN: ndarray
        Matrix representation of the requested linear combinations. Even
        with a hermitian operator, this matrix need not be self-adjoint.
        However, unlike the results from calculate_matrix_elements, it is
        assumed that all matrix elements are filled in (use e.g. tri2full).
    psit_nG: ndarray
        Set of vectors in which the matrix elements are evaluated.
    P_ani: dict
        Dictionary of projector overlap integrals P_ni = <p_i | psit_nG>.
        Its arrays are overwritten with the transformed values.
    out_nG: ndarray
        Optional output array.  It is only consulted in the simple case
        (one band group and one block); the parallel/blocked branch never
        reads it and writes the result into psit_nG instead.

    Returns out_nG (or the internal work array when out_nG is None) in the
    simple case, otherwise psit_nG restored to its original shape.
    """
    # Work arrays are created on first use.
    if self.A_nn is None:
        self.allocate_arrays()

    band_comm = self.bd.comm
    B = band_comm.size      # size of the band communicator
    J = self.nblocks        # requested number of grid slices
    N = self.bd.mynbands    # local band count (rows of the reshaped psit_nG)

    C_NN = self.bmd.redistribute_input(C_NN)

    if B == 1 and J == 1:
        # Simple case:
        if use_mic:
            work_nG = self.work1_xG_mic
        else:
            work_nG = reshape(self.work1_xG, psit_nG.shape)
        if out_nG is None:
            # No output given: the result lands in (and is returned as)
            # the work array.
            out_nG = work_nG
            # out_nG[:] = 117  # gemm may not like nan's
        elif out_nG is psit_nG:
            # In-place request: copy the input into the work array so gemm
            # does not read from the array it is writing to.
            work_nG[:] = psit_nG
            psit_nG = work_nG

        if use_mic:
            # NOTE(review): offload_report(1)/(0) presumably switches
            # offload reporting on and off around the MIC call -- confirm.
            if self.gd.comm.rank == 0:
                offload_report(1)
            # Stage the coefficient matrix in the offload buffer and push
            # it to the device before launching the offloaded gemm.
            C_NN_mic = self.A_nn_mic
            C_NN_mic.array[:] = C_NN[:]
            C_NN_mic.update_device()
            stream.sync()
            mic_gemm(1.0, psit_nG, C_NN_mic, 0.0, out_nG)
            if self.gd.comm.rank == 0:
                offload_report(0)
        else:
            self.gd.gemm(1.0, psit_nG, C_NN, 0.0, out_nG)
        if P_ani:
            # Transform each projector overlap array in place; a copy is
            # used as gemm input because P_ni is also the output.
            for P_ni in P_ani.values():
                gemm(1.0, P_ni.copy(), C_NN, 0.0, P_ni)
        return out_nG

    # Now it gets nasty! We parallelize over B groups of bands and
    # each grid chunk is divided in J smaller slices (less memory).

    Q = B  # always non-hermitian XXX
    rank = band_comm.rank
    shape = psit_nG.shape   # remembered so it can be restored on return
    psit_nG = psit_nG.reshape(N, -1)
    G = psit_nG.shape[1]  # number of grid-points
    g = int(np.ceil(G / float(J)))  # nominal slice width (rounded up)

    # Buffers for send/receive of pre-multiplication versions of P_ani's.
    sbuf_nI = rbuf_nI = None
    if P_ani:
        # All P_ni arrays are packed side by side into one contiguous
        # buffer; the receive buffer is only needed with more than one
        # band group.
        sbuf_nI = np.hstack([P_ni for P_ni in P_ani.values()])
        sbuf_nI = np.ascontiguousarray(sbuf_nI)
        if B > 1:
            rbuf_nI = np.empty_like(sbuf_nI)

    # Because of the amount of communication involved, we need to
    # be synchronized up to this point but only on the 1D band_comm
    # communication ring
    band_comm.barrier()
    # Rounding g up can leave trailing slices that would start at or
    # beyond G; drop them.
    while g * J >= G + g:  # remove extra slice(s)
        J -= 1
    assert 0 < g * J < G + g

    work1_xG = reshape(self.work1_xG, (self.X,) + psit_nG.shape[1:])
    work2_xG = reshape(self.work2_xG, (self.X,) + psit_nG.shape[1:])

    for j in range(J):
        # Grid-point range [G1, G2) of this slice; the slice that reaches
        # the end of the grid is clipped and g shrinks accordingly.
        G1 = j * g
        G2 = G1 + g
        if G2 > G:
            G2 = G
            g = G2 - G1
        sbuf_ng = reshape(work1_xG, (N, g))
        rbuf_ng = reshape(work2_xG, (N, g))
        sbuf_ng[:] = psit_nG[:, G1:G2]
        # beta = 0 makes the first gemm overwrite the output slice; later
        # steps accumulate with beta = 1.
        beta = 0.0
        # Truthy only on the last slice and only when P_ani is non-empty.
        cycle_P_ani = (j == J - 1 and P_ani)
        for q in range(Q):
            # Start sending currently buffered kets to rank below
            # and receiving next set of kets from rank above us.
            # If we're at the last slice, start cycling P_ani too.
            if q < Q - 1:
                self._initialize_cycle(sbuf_ng, rbuf_ng,
                                       sbuf_nI, rbuf_nI, cycle_P_ani)

            # Calculate wave-function contributions from the current slice
            # of grid data by the current mynbands x mynbands matrix block.
            C_nn = self.bmd.extract_block(C_NN, (rank + q) % B, rank)
            self.gd.gemm(1.0, sbuf_ng, C_nn, beta, psit_nG[:, G1:G2])

            # If we're at the last slice, add contributions to P_ani's.
            if cycle_P_ani:
                # Walk the packed buffer column range [I1, I2) belonging
                # to each P_ni in turn.
                I1 = 0
                for P_ni in P_ani.values():
                    I2 = I1 + P_ni.shape[1]
                    gemm(1.0, sbuf_nI[:, I1:I2], C_nn, beta, P_ni)
                    I1 = I2

            # Wait for all send/receives to finish before next iteration.
            # Swap send and receive buffer such that next becomes current.
            # If we're at the last slice, also finishes the P_ani cycle.
            if q < Q - 1:
                sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI = self._finish_cycle(
                    sbuf_ng, rbuf_ng, sbuf_nI, rbuf_nI, cycle_P_ani)

            # First iteration was special because we initialized the kets
            if q == 0:
                beta = 1.0

    # NOTE(review): out_nG is not used on this path; the result is
    # returned in psit_nG (the caller's buffer if the reshape above
    # produced a view).
    psit_nG.shape = shape
    return psit_nG
def integrate(self, a_xg, b_yg=None,
              global_integral=True, hermitian=False,
              _transposed_result=None):
    """Integrate function(s) over domain.

    a_xg: ndarray
        Function(s) to be integrated.
    b_yg: ndarray
        If present, integrate a_xg.conj() * b_yg.
    global_integral: bool
        If the array(s) are distributed over several domains, then the
        total sum will be returned.  To get the local contribution
        only, use global_integral=False.
    hermitian: bool
        Result is hermitian.
    _transposed_result: ndarray
        Long story.  Don't use this unless you are a method of the
        MatrixOperator class ...  When given, it is used as the output
        buffer and the global sum is skipped.
    """
    lead_shape = a_xg.shape[:-3]

    if b_yg is None:
        # Only one array: collapse the last three axes and scale by the
        # volume element.
        total = a_xg.reshape(lead_shape + (-1,)).sum(axis=-1) * self.dv
        if not global_integral:
            return total
        if total.ndim == 0:
            # A scalar cannot be summed in place; use the returned value.
            return self.comm.sum(total)
        self.comm.sum(total)
        return total

    offloaded = isinstance(a_xg, mic.OffloadArray)

    if offloaded:
        # offload arrays have to be contiguous in any case
        lhs_xg, rhs_yg = a_xg, b_yg
    else:
        lhs_xg = np.ascontiguousarray(
            a_xg.reshape((-1,) + a_xg.shape[-3:]))
        rhs_yg = np.ascontiguousarray(
            b_yg.reshape((-1,) + b_yg.shape[-3:]))

    if _transposed_result is not None:
        out_yx = _transposed_result
        global_integral = False
    else:
        out_yx = np.zeros((len(rhs_yg), len(lhs_xg)), lhs_xg.dtype)

    same_array = a_xg is b_yg

    if offloaded:
        # Bind the host result buffer to the device, run the kernel
        # there and copy the result back.
        out_yx_mic = stream.bind(out_yx)
        stream.sync()
        if same_array:
            # dsyrk performs badly in MIC so use dgemm here
            mic_gemm(self.dv, lhs_xg, lhs_xg, 0.0, out_yx_mic, 'c')
        elif hermitian:
            # NOTE(review): the host path scales by 0.5 * dv while this
            # call passes dv -- confirm mic_r2k applies the factor itself.
            mic_r2k(self.dv, lhs_xg, rhs_yg, 0.0, out_yx_mic)
        else:
            mic_gemm(self.dv, lhs_xg, rhs_yg, 0.0, out_yx_mic, 'c')
        out_yx_mic.update_host()
        stream.sync()
    elif same_array:
        rk(self.dv, lhs_xg, 0.0, out_yx)
    elif hermitian:
        r2k(0.5 * self.dv, lhs_xg, rhs_yg, 0.0, out_yx)
    else:
        gemm(self.dv, lhs_xg, rhs_yg, 0.0, out_yx, 'c')

    if global_integral:
        self.comm.sum(out_yx)

    # The kernels fill a (y, x) matrix; transpose and restore the leading
    # shapes of both inputs.
    result = out_yx.T.reshape(lead_shape + b_yg.shape[:-3])
    return result.item() if result.ndim == 0 else result
# Script fragment: verifies the offloaded copies of `a` and `b`, then times
# `repeats` offloaded gemm calls and prints checksums of the result.
# `a_mic`, `b`, `c`, `alpha`, `beta`, `repeats`, `device`, `mic` and
# `mic_gemm` are expected to be defined earlier in the script.  All prints
# use a single parenthesised expression so the output is the same on
# Python 2 and Python 3.

# a_mic.array = a_mic.array.reshape(a_mic.shape[:-3] + (-1,))
a_mic.update_host()
a_mic_sum = np.sum(a_mic.array)
print(" sum(a_mic)=" + str(a_mic_sum))

b_mic = mic.offload_array(b.shape, dtype=float)
b_mic.fillfrom(b)
# b_mic.update_device()
# b_mic.array = b_mic.array.reshape(b_mic.shape[:-3] + (-1,))
b_mic.update_host()
b_mic_sum = np.sum(b_mic.array)
print(" sum(b_mic)=" + str(b_mic_sum))

# c_mic = offload_array(c.shape, dtype=float)
# c_mic.fill(0.0)
c_mic = device.associate(c)
c_mic.update_device()

# Timed section: the gemm calls plus the final device-to-host copy.
t0 = time()
for r in range(repeats):
    mic_gemm(alpha, a_mic, b_mic, beta, c_mic, "c")
    # mic_r2k(alpha, a_mic, b_mic, beta, c_mic)
c_mic.update_host()
c[:] = c_mic.array[:]
t1 = time()

print("MIC time %s" % (t1 - t0))
print("MIC checks")
c_mic_sum = np.sum(c_mic.array)
print(" sum(c_mic)=" + str(c_mic_sum))
print(" sum(c)=" + str(np.sum(c)))