def make_gpu_norm(dtype):
    """ Returns a function c = vec_norm(A) that computes c = sqrt(A'A). """
    # GPU code for the gce.Kernel.
    code = Template("""
        if (_in_global) {
            norm_a += conj(Ax(0,0,0))*Ax(0,0,0);
            norm_a += conj(Ay(0,0,0))*Ay(0,0,0);
            norm_a += conj(Az(0,0,0))*Az(0,0,0);
        }
        """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel.
    grid_names = [A + i for A in ['A'] for i in ['x', 'y', 'z']]
    prod_fun = Kernel(code, \
                      ('norm_a', 'out', dtype), \
                      *[(name, 'grid', dtype) for name in grid_names], \
                      shape_filter='skinny')
    norm_a = Out(dtype)

    # Define the actual function.
    def gpu_norm(A):
        # No post_sync needed: only the scalar output is written.
        prod_fun(norm_a, *A)
        return np.sqrt(norm_a.get())

    return gpu_norm
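# A minimal NumPy reference for gpu_norm's semantics, handy for unit-testing
# the kernel result on a single node. The helper name cpu_norm is hypothetical
# (not part of this module's API); it assumes A is a list of three ndarrays.
def cpu_norm(A):
    # np.vdot conjugates its first argument, matching conj(A)*A in the kernel.
    return np.sqrt(sum(np.vdot(a, a) for a in A).real)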
def make_gpu_sum(dtype):
    """ Returns a function gpu_sum(a, b, A, B, C) that computes C = aA + bB. """
    # GPU code for the gce.Kernel.
    code = Template("""
        if (_in_global) {
            Cx(0,0,0) = a*Ax(0,0,0) + b*Bx(0,0,0);
            Cy(0,0,0) = a*Ay(0,0,0) + b*By(0,0,0);
            Cz(0,0,0) = a*Az(0,0,0) + b*Bz(0,0,0);
        }
        """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel.
    grid_names = [A + i for A in ['A', 'B', 'C'] for i in ['x', 'y', 'z']]
    sum_fun = Kernel(code, \
                     ('a', 'number', dtype), \
                     ('b', 'number', dtype), \
                     *[(name, 'grid', dtype) for name in grid_names], \
                     shape_filter='skinny')

    # Define the actual function.
    def gpu_sum(a, b, A, B, C):
        sum_fun(dtype(a), dtype(b), \
                *(A + B + C), \
                post_sync=C)  # Post-sync C so its overlap regions stay consistent.

    return gpu_sum
def make_gpu_dot(dtype):
    """ Returns a function c = vec_dot(A, B) that computes c = A'B. """
    # GPU code for the gce.Kernel.
    code = Template("""
        if (_in_global) {
            dot_ab += conj(Ax(0,0,0))*Bx(0,0,0);
            dot_ab += conj(Ay(0,0,0))*By(0,0,0);
            dot_ab += conj(Az(0,0,0))*Bz(0,0,0);
        }
        """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel.
    grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
    prod_fun = Kernel(code, \
                      ('dot_ab', 'out', dtype), \
                      *[(name, 'grid', dtype) for name in grid_names], \
                      shape_filter='skinny')
    dot_ab = Out(dtype)

    # Define the actual function.
    def gpu_dot(A, B):
        prod_fun(dot_ab, *(A + B))
        return dot_ab.get()

    return gpu_dot
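# A matching NumPy reference for gpu_dot's semantics (hypothetical helper,
# assuming A and B are lists of three ndarrays): c = A'B with A conjugated.
def cpu_dot(A, B):
    return sum(np.vdot(a, b) for a, b in zip(A, B))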
def rho_step(dtype):
    """ Return the function to execute the rho step of the bicg algorithm. """
    # Code for the rho step function.
    code = Template("""
        if (_in_global) {
            x0(0,0,0) = x0(0,0,0) + alpha * p0(0,0,0);
            x1(0,0,0) = x1(0,0,0) + alpha * p1(0,0,0);
            x2(0,0,0) = x2(0,0,0) + alpha * p2(0,0,0);
            {{ type }} s0 = r0(0,0,0) - alpha * v0(0,0,0);
            {{ type }} s1 = r1(0,0,0) - alpha * v1(0,0,0);
            {{ type }} s2 = r2(0,0,0) - alpha * v2(0,0,0);
            rho += (s0 * s0) + (s1 * s1) + (s2 * s2);
            err += (real(s0) * real(s0)) + \
                   (imag(s0) * imag(s0)) + \
                   (real(s1) * real(s1)) + \
                   (imag(s1) * imag(s1)) + \
                   (real(s2) * real(s2)) + \
                   (imag(s2) * imag(s2));
            r0(0,0,0) = s0;
            r1(0,0,0) = s1;
            r2(0,0,0) = s2;
        }
        """).render(type=_get_cuda_type(dtype))

    # Compile the code.
    grid_names = [A + i for A in ['p', 'r', 'v', 'x'] for i in ['0', '1', '2']]
    rho_fun = Kernel(code, \
                     ('alpha', 'number', dtype), \
                     ('rho', 'out', dtype), \
                     ('err', 'out', dtype), \
                     *[(name, 'grid', dtype) for name in grid_names], \
                     shape_filter='skinny')

    # Temporary values that are needed.
    rho_out = Out(dtype)
    err_out = Out(dtype)

    # Define the actual function.
    def rho_step(alpha, p, r, v, x):
        rho_fun(dtype(alpha), rho_out, err_out, *(p + r + v + x), \
                post_sync=r)  # r must be post-synced for the upcoming alpha step.
        return rho_out.get(), np.sqrt(err_out.get())

    return rho_step
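# For clarity, a NumPy sketch of what the rho-step kernel computes per node
# (hypothetical helper; the rho and err reductions still have to be summed
# across MPI nodes by the caller). Note that rho uses the unconjugated
# product s*s, exactly as in the kernel above.
def cpu_rho_step(alpha, p, r, v, x):
    rho = 0
    err = 0.0
    for k in range(3):
        x[k] += alpha * p[k]
        s = r[k] - alpha * v[k]
        rho += np.sum(s * s)           # Unconjugated, as in the kernel.
        err += np.sum(np.abs(s)**2)    # real(s)^2 + imag(s)^2.
        r[k] = s
    return rho, np.sqrt(err)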
def make_gpu_copy(dtype):
    """ Returns a function that computes B = A. """
    # GPU code for the gce.Kernel.
    code = Template("""
        if (_in_global) {
            Bx(0,0,0) = Ax(0,0,0);
            By(0,0,0) = Ay(0,0,0);
            Bz(0,0,0) = Az(0,0,0);
        }
        """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel.
    grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
    copy_fun = Kernel(code, \
                      *[(name, 'grid', dtype) for name in grid_names], \
                      shape_filter='skinny')

    # Define the actual function.
    def gpu_copy(A, B):
        copy_fun(*(A + B), \
                 post_sync=B)  # Post-sync B so its overlap regions stay consistent.

    return gpu_copy
def make_gpu_cond(dtype, cond):
    """ Returns a function gpu_cond(A) that computes A = A*C elementwise. """
    # GPU code for the gce.Kernel.
    code = Template("""
        if (_in_global) {
            Ax(0,0,0) = Ax(0,0,0)*Cx(0,0,0);
            Ay(0,0,0) = Ay(0,0,0)*Cy(0,0,0);
            Az(0,0,0) = Az(0,0,0)*Cz(0,0,0);
        }
        """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel.
    grid_names = [A + i for A in ['A', 'C'] for i in ['x', 'y', 'z']]
    cond_fun = Kernel(code, \
                      *[(name, 'grid', dtype) for name in grid_names], \
                      shape_filter='skinny')
    C = cond

    # Define the actual function.
    def gpu_cond(A):
        cond_fun(*(A + C), \
                 post_sync=A)  # Post-sync A so its overlap regions stay consistent.

    return gpu_cond
def make_gpu_scaled_copy(dtype):
    """ Returns a function vec_scaled_copy(A, a, B) that computes B = aA. """
    # GPU code for the gce.Kernel.
    code = Template("""
        if (_in_global) {
            Bx(0,0,0) = a*Ax(0,0,0);
            By(0,0,0) = a*Ay(0,0,0);
            Bz(0,0,0) = a*Az(0,0,0);
        }
        """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel.
    grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
    scaled_copy_fun = Kernel(code, \
                             ('a', 'number', dtype), \
                             *[(name, 'grid', dtype) for name in grid_names], \
                             shape_filter='skinny')

    # Define the actual function.
    def gpu_scaled_copy(A, a, B):
        scaled_copy_fun(dtype(a), \
                        *(A + B), \
                        post_sync=B)

    return gpu_scaled_copy
def make_gpu_addvec(dtype):
    """ Returns a function vec_addvec(A, b, B) that computes A = A + bB. """
    # GPU code for the gce.Kernel.
    code = Template("""
        if (_in_global) {
            Ax(0,0,0) = Ax(0,0,0) + b*Bx(0,0,0);
            Ay(0,0,0) = Ay(0,0,0) + b*By(0,0,0);
            Az(0,0,0) = Az(0,0,0) + b*Bz(0,0,0);
        }
        """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel.
    grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
    addvec_fun = Kernel(code, \
                        ('b', 'number', dtype), \
                        *[(name, 'grid', dtype) for name in grid_names], \
                        shape_filter='skinny')

    # Define the actual function.
    def gpu_addvec(A, b, B):
        addvec_fun(dtype(b), \
                   *(A + B), \
                   post_sync=A)  # Post-sync A so its overlap regions stay consistent.

    return gpu_addvec
def make_gpu_scale(dtype):
    """ Returns a function scale(A, a) that computes A = aA. """
    # GPU code for the gce.Kernel.
    code = Template("""
        if (_in_global) {
            Ax(0,0,0) = a*Ax(0,0,0);
            Ay(0,0,0) = a*Ay(0,0,0);
            Az(0,0,0) = a*Az(0,0,0);
        }
        """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel.
    grid_names = [A + i for A in ['A'] for i in ['x', 'y', 'z']]
    scale_fun = Kernel(code, \
                       ('a', 'number', dtype), \
                       *[(name, 'grid', dtype) for name in grid_names], \
                       shape_filter='skinny')

    # Define the actual function.
    def gpu_scale(A, a):
        scale_fun(dtype(a), \
                  *A, \
                  post_sync=A)  # Post-sync A so its overlap regions stay consistent.

    return gpu_scale
def make_gpu_fdfd_matrix_multiplication(params, dtype):
    """ Return a function vec_matrix_multiplication(X, B) that computes B = AX. """
    num_shared_banks = 6

    # Render the pre-loop and in-loop code.
    cuda_type = _get_cuda_type(dtype)
    code_allpre = jinja_env.get_template('fdfd_matrix_multiplication_pec_pmc.cu').\
                    render(dims=params['shape'], \
                           type=cuda_type, \
                           mu_equals_1=False, \
                           full_operator=True)

    # Grid input parameters.
    grid_params = [(A + i, 'grid', dtype) for A in ['X', 'B', 'e', 'm'] \
                   for i in ['x', 'y', 'z']]

    # Const input parameters.
    const_names = ('sx0', 'sy0', 'sz0', 'sx1', 'sy1', 'sz1') + \
                  ('sqrt_sx0', 'sqrt_sy0', 'sqrt_sz0', \
                   'sqrt_sx1', 'sqrt_sy1', 'sqrt_sz1') + \
                  ('bloch_x', 'bloch_y', 'bloch_z')
    const_sizes = params['shape'] * 4 + tuple([3]) * 3
    const_params = [(const_names[k], 'const', dtype, const_sizes[k]) \
                    for k in range(len(const_sizes))]
    const_params.append(('pemc', 'const', params['pemc'].dtype.type, 6))

    # Compile. (Note shape_filter='square'.)
    A_multiplication_fun = Kernel('', \
                                  *(grid_params + const_params), \
                                  pre_loop=code_allpre, \
                                  padding=(1,1,1,1), \
                                  smem_per_thread=num_shared_banks*16, \
                                  shape_filter='square')

    # Grid variables.
    # NOTE: eps is scattered over the GPUs when e is initialized.
    e = [Grid(dtype(f), x_overlap=1) for f in params['e']]
    m = [Grid(dtype(f), x_overlap=1) for f in params['m']]  # Optional.

    # Constant variables.
    sc_pml_0 = [Const(dtype(s**-1)) for s in params['s']]
    sc_pml_1 = [Const(dtype(t**-1)) for t in params['t']]
    sqrt_sc_pml_0 = [Const(dtype(np.sqrt(s**-1))) for s in params['s']]
    sqrt_sc_pml_1 = [Const(dtype(np.sqrt(t**-1))) for t in params['t']]
    bloch_x = [Const(dtype(params['bloch_phase'][0]))]
    bloch_y = [Const(dtype(params['bloch_phase'][1]))]
    bloch_z = [Const(dtype(params['bloch_phase'][2]))]
    pemc = [Const(params['pemc'])]

    # Define the function.
    def gpu_fdfd_matrix_multiplication(X, B):
        # Execute the CUDA code.
        A_multiplication_fun( \
            *(X + B + e + m + \
              sc_pml_0 + sc_pml_1 + sqrt_sc_pml_0 + sqrt_sc_pml_1 + \
              bloch_x + bloch_y + bloch_z + pemc), \
            post_sync=B)

    return gpu_fdfd_matrix_multiplication
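# Hypothetical usage sketch (not called anywhere in this module): builds the
# multiplication function and applies it once. Assumes `params` was assembled
# by the surrounding solver and `fields` is a list of three complex ndarrays;
# X and B are lists of three gce.Grid instances, as elsewhere in this module.
def _example_fdfd_multiply(params, fields):
    multiply_A = make_gpu_fdfd_matrix_multiplication(params, np.complex128)
    X = [Grid(np.complex128(f), x_overlap=1) for f in fields]  # Input grids.
    B = [Grid(np.complex128, x_overlap=1) for k in range(3)]   # Output grids.
    multiply_A(X, B)  # Writes B = AX and post-syncs B.
    return B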
def alpha_step(params, dtype):
    """ Define the alpha step function needed for the bicg algorithm. """
    num_shared_banks = 6

    # Render the pre-loop and in-loop code.
    cuda_type = _get_cuda_type(dtype)
    code_allpre = jinja_env.get_template('alpha_allpre.cu').\
                    render(dims=params['shape'], \
                           type=cuda_type, \
                           mu_equals_1=False, \
                           full_operator=True)

    # Grid input parameters.
    grid_params = [(A + i, 'grid', dtype) for A in ['P', 'P1', 'R', 'V', 'e', 'm'] \
                   for i in ['x', 'y', 'z']]

    # Const input parameters.
    const_names = ('sx0', 'sy0', 'sz0', 'sx1', 'sy1', 'sz1') + \
                  ('sqrt_sx0', 'sqrt_sy0', 'sqrt_sz0', \
                   'sqrt_sx1', 'sqrt_sy1', 'sqrt_sz1')
    const_sizes = params['shape'] * 4
    const_params = [(const_names[k], 'const', dtype, const_sizes[k]) \
                    for k in range(len(const_sizes))]

    # Compile.
    alpha_fun = Kernel('', \
                       ('beta', 'number', dtype), \
                       ('alpha_denom', 'out', dtype), \
                       *(grid_params + const_params), \
                       pre_loop=code_allpre, \
                       padding=(1,1,1,1), \
                       smem_per_thread=num_shared_banks*16, \
                       shape_filter='square')

    # Temporary variables.
    alpha_denom_out = Out(dtype)
    p_temp = [Grid(dtype, x_overlap=1) for k in range(3)]  # Used to swap p.

    # Grid variables.
    e = [Grid(dtype(f), x_overlap=1) for f in params['e']]
    m = [Grid(dtype(f), x_overlap=1) for f in params['m']]  # Optional.

    # Constant variables.
    sc_pml_0 = [Const(dtype(s**-1)) for s in params['s']]
    sc_pml_1 = [Const(dtype(t**-1)) for t in params['t']]
    sqrt_sc_pml_0 = [Const(dtype(np.sqrt(s**-1))) for s in params['s']]
    sqrt_sc_pml_1 = [Const(dtype(np.sqrt(t**-1))) for t in params['t']]

    # Define the function.
    def alpha_step(rho_k, rho_k_1, p, r, v):
        # Execute the CUDA code.
        # Notice that p_temp and v are post-synced.
        alpha_fun(dtype(rho_k/rho_k_1), alpha_denom_out, \
                  *(p + p_temp + r + v + e + m + \
                    sc_pml_0 + sc_pml_1 + sqrt_sc_pml_0 + sqrt_sc_pml_1), \
                  post_sync=p_temp+v)
        p[:], p_temp[:] = p_temp[:], p[:]  # Deep swap.
        return rho_k / alpha_denom_out.get()  # The value of alpha.

    return alpha_step
def omega_biCGSTAB_step(params, dtype):
    """ Define the omega step function needed for the biCGSTAB algorithm. """
    '''
    This returns a function that will perform the omega step, i.e. the part
    of the biCGSTAB algorithm that computes

        s = r - alpha * v
        t = A*s
        omega = (t*s) / (t*t)

    Note that omega_fun does not compute omega directly, since t and s are
    scattered over the different MPI nodes. Instead, each node computes its
    local contributions to omega_num and omega_denom, which are then
    combined by omega_step.
    '''
    num_shared_banks = 6

    # Render the pre-loop and in-loop code.
    cuda_type = _get_cuda_type(dtype)
    code_allpre = jinja_env.get_template('omega_bloch_pmc_pec.cu').\
                    render(dims=params['shape'], \
                           type=cuda_type, \
                           mu_equals_1=False, \
                           full_operator=True)

    # Grid input parameters.
    grid_params = [(A + i, 'grid', dtype) for A in ['V', 'S', 'R', 'T', 'e', 'm'] \
                   for i in ['x', 'y', 'z']]

    # Const input parameters.
    const_names = ('sx0', 'sy0', 'sz0', 'sx1', 'sy1', 'sz1') + \
                  ('sqrt_sx0', 'sqrt_sy0', 'sqrt_sz0', \
                   'sqrt_sx1', 'sqrt_sy1', 'sqrt_sz1') + \
                  ('bloch_x', 'bloch_y', 'bloch_z')
    const_sizes = params['shape'] * 4 + tuple([3]) * 3
    const_params = [(const_names[k], 'const', dtype, const_sizes[k]) \
                    for k in range(len(const_sizes))]
    const_params.append(('pemc', 'const', params['pemc'].dtype.type, 6))

    # Compile. (Note shape_filter='square'.)
    omega_fun = Kernel('', \
                       ('alpha', 'number', dtype), \
                       ('omega_num', 'out', dtype), \
                       ('omega_denom', 'out', dtype), \
                       *(grid_params + const_params), \
                       pre_loop=code_allpre, \
                       padding=(1,1,1,1), \
                       smem_per_thread=num_shared_banks*16, \
                       shape_filter='square')

    # Temporary variables.
    omega_num_out = Out(dtype)
    omega_denom_out = Out(dtype)

    # Grid variables.
    e = [Grid(dtype(f), x_overlap=1) for f in params['e']]
    m = [Grid(dtype(f), x_overlap=1) for f in params['m']]  # Optional.

    # Constant variables.
    sc_pml_0 = [Const(dtype(s**-1)) for s in params['s']]
    sc_pml_1 = [Const(dtype(t**-1)) for t in params['t']]
    sqrt_sc_pml_0 = [Const(dtype(np.sqrt(s**-1))) for s in params['s']]
    sqrt_sc_pml_1 = [Const(dtype(np.sqrt(t**-1))) for t in params['t']]
    bloch_x = [Const(dtype(params['bloch_phase'][0]))]
    bloch_y = [Const(dtype(params['bloch_phase'][1]))]
    bloch_z = [Const(dtype(params['bloch_phase'][2]))]
    pemc = [Const(params['pemc'])]

    # Define the function.
    def omega_step(alpha, V, S, R, T, compute_omega=True):
        # Execute the CUDA code.
        # Notice that S and T are post-synced.
        omega_fun(dtype(alpha), omega_num_out, omega_denom_out, \
                  *(V + S + R + T + e + m + \
                    sc_pml_0 + sc_pml_1 + sqrt_sc_pml_0 + sqrt_sc_pml_1 + \
                    bloch_x + bloch_y + bloch_z + pemc), \
                  post_sync=S + T)
        if compute_omega:
            return omega_num_out.get() / omega_denom_out.get()  # The value of omega.

    return omega_step
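# For reference, a single-node NumPy sketch of the omega reduction that
# omega_step assembles (hypothetical helper). The conjugation convention
# here follows the usual biCGSTAB choice omega = (t's)/(t't) and is an
# assumption; on the GPU the kernel also recomputes s and t via the operator,
# which this sketch omits.
def cpu_omega(S, T):
    omega_num = sum(np.vdot(t, s) for s, t in zip(S, T))    # t' s, t conjugated.
    omega_denom = sum(np.vdot(t, t) for t in T)             # t' t.
    return omega_num / omega_denom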
def alpha_biCGSTAB_step(params, dtype):
    """ Define the alpha step function needed for the biCGSTAB algorithm. """
    '''
    This returns a function that will perform the alpha step, i.e. the part
    of the biCGSTAB algorithm that computes

        p = r + (rho[k]/rho[k-1]) * (alpha/omega) * (p - omega*v)
        v = A*p
        alpha = rho[k] / (r_hat' * v)

    Note that alpha_fun does not compute alpha directly, since p and v are
    scattered over the different MPI nodes. Instead, each node computes its
    local contribution to alpha_denom, and the contributions are then
    combined by alpha_biCGSTAB_step.
    '''
    num_shared_banks = 6

    # Render the pre-loop and in-loop code.
    cuda_type = _get_cuda_type(dtype)
    #code_allpre = jinja_env.get_template('alpha_biCGSTAB.cu').\
    code_allpre = jinja_env.get_template('alpha_bloch_pmc_pec.cu').\
                    render(dims=params['shape'], \
                           type=cuda_type, \
                           mu_equals_1=False, \
                           full_operator=True)

    # Grid input parameters.
    grid_params = [(A + i, 'grid', dtype) \
                   for A in ['P', 'P1', 'R', 'R_hatH', 'V', 'V1', 'e', 'm'] \
                   for i in ['x', 'y', 'z']]

    # Const input parameters.
    const_names = ('sx0', 'sy0', 'sz0', 'sx1', 'sy1', 'sz1') + \
                  ('sqrt_sx0', 'sqrt_sy0', 'sqrt_sz0', \
                   'sqrt_sx1', 'sqrt_sy1', 'sqrt_sz1') + \
                  ('bloch_x', 'bloch_y', 'bloch_z')
    const_sizes = params['shape'] * 4 + tuple([3]) * 3
    const_params = [(const_names[k], 'const', dtype, const_sizes[k]) \
                    for k in range(len(const_sizes))]
    const_params.append(('pemc', 'const', params['pemc'].dtype.type, 6))

    # Compile. (Note shape_filter='square'.)
    alpha_fun = Kernel('', \
                       ('beta', 'number', dtype), \
                       ('omega', 'number', dtype), \
                       ('alpha_denom', 'out', dtype), \
                       *(grid_params + const_params), \
                       pre_loop=code_allpre, \
                       padding=(1,1,1,1), \
                       smem_per_thread=num_shared_banks*16, \
                       shape_filter='square')

    # Temporary variables.
    alpha_denom_out = Out(dtype)
    P_temp = [Grid(dtype, x_overlap=1) for k in range(3)]  # Used to swap P.
    V_temp = [Grid(dtype, x_overlap=1) for k in range(3)]  # Used to swap V.

    # Grid variables.
    # NOTE: eps is scattered over the GPUs when e is initialized.
    e = [Grid(dtype(f), x_overlap=1) for f in params['e']]
    m = [Grid(dtype(f), x_overlap=1) for f in params['m']]  # Optional.

    # Constant variables.
    sc_pml_0 = [Const(dtype(s**-1)) for s in params['s']]
    sc_pml_1 = [Const(dtype(t**-1)) for t in params['t']]
    sqrt_sc_pml_0 = [Const(dtype(np.sqrt(s**-1))) for s in params['s']]
    sqrt_sc_pml_1 = [Const(dtype(np.sqrt(t**-1))) for t in params['t']]
    bloch_x = [Const(dtype(params['bloch_phase'][0]))]
    bloch_y = [Const(dtype(params['bloch_phase'][1]))]
    bloch_z = [Const(dtype(params['bloch_phase'][2]))]
    pemc = [Const(params['pemc'])]

    # Define the function.
    def alpha_biCGSTAB_step(rho_k, rho_k_1, alpha, omega, P, R, R_hatH, V, \
                            compute_alpha=True):
        # Execute the CUDA code.
        # Notice that P_temp and V_temp are post-synced.
        alpha_fun(dtype((rho_k*alpha)/(rho_k_1*omega)), dtype(omega), alpha_denom_out, \
                  *(P + P_temp + R + R_hatH + V + V_temp + e + m + \
                    sc_pml_0 + sc_pml_1 + sqrt_sc_pml_0 + sqrt_sc_pml_1 + \
                    bloch_x + bloch_y + bloch_z + pemc), \
                  post_sync=P_temp + V_temp)
        P[:], P_temp[:] = P_temp[:], P[:]  # Deep swap.
        V[:], V_temp[:] = V_temp[:], V[:]  # Deep swap.
        # TODO(logansu): Remove compute_alpha. solve_symm_lumped does not use
        # alpha_step to solve for the matrix. Because solve_symm_lumped sets
        # r to the zero vector to compute the matrix multiplication,
        # alpha_denom_out comes out to be zero. The if-statement stops
        # DivisionByZero when this happens (which is important for us to
        # catch real DivisionByZero errors).
        if compute_alpha:
            return rho_k / alpha_denom_out.get()  # The value of alpha.
    return alpha_biCGSTAB_step
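# A single-node NumPy sketch of the alpha step's arithmetic (hypothetical
# helper; apply_A stands in for the fused A*p product that the kernel
# performs). As in the rho kernel below, R_hatH is treated as already
# conjugated, so plain products are used; this convention is an assumption.
def cpu_alpha_step(rho_k, rho_k_1, alpha, omega, P, R, R_hatH, V, apply_A):
    beta = (rho_k * alpha) / (rho_k_1 * omega)  # Same scalar passed to alpha_fun.
    for k in range(3):
        P[k] = R[k] + beta * (P[k] - omega * V[k])
    V[:] = apply_A(P)                           # v = A*p.
    alpha_denom = sum(np.sum(rh * v) for rh, v in zip(R_hatH, V))
    return rho_k / alpha_denom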
def rho_biCGSTAB_step(dtype):
    """ Return the function to execute the rho step of the biCGSTAB algorithm. """
    '''
    This returns a function that will perform the rho step, i.e. the part
    of the biCGSTAB algorithm that computes

        x = x + alpha*p + omega*s
        r = s - omega*t
        rho[k+1] = r_hatH * r
        err = conj(r) * r

    The function returns rho[k+1] and err (these are returned to the CPU,
    where they are gathered and summed!).
    '''
    # Code for the rho step function.
    code = Template("""
        if (_in_global) {
            X1x(0,0,0) = Xx(0,0,0) + alpha*Px(0,0,0) + omega * Sx(0,0,0);
            X1y(0,0,0) = Xy(0,0,0) + alpha*Py(0,0,0) + omega * Sy(0,0,0);
            X1z(0,0,0) = Xz(0,0,0) + alpha*Pz(0,0,0) + omega * Sz(0,0,0);
            {{ type }} R_tmpx = Sx(0,0,0) - omega * Tx(0,0,0);
            {{ type }} R_tmpy = Sy(0,0,0) - omega * Ty(0,0,0);
            {{ type }} R_tmpz = Sz(0,0,0) - omega * Tz(0,0,0);
            rho += (R_hatHx(0,0,0) * R_tmpx) + \
                   (R_hatHy(0,0,0) * R_tmpy) + \
                   (R_hatHz(0,0,0) * R_tmpz);
            err += (real(R_tmpx) * real(R_tmpx)) + \
                   (imag(R_tmpx) * imag(R_tmpx)) + \
                   (real(R_tmpy) * real(R_tmpy)) + \
                   (imag(R_tmpy) * imag(R_tmpy)) + \
                   (real(R_tmpz) * real(R_tmpz)) + \
                   (imag(R_tmpz) * imag(R_tmpz));
            Rx(0,0,0) = R_tmpx;
            Ry(0,0,0) = R_tmpy;
            Rz(0,0,0) = R_tmpz;
        }
        """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel.
    grid_names = [A + i for A in ['S', 'X', 'X1', 'P', 'T', 'R', 'R_hatH'] \
                  for i in ['x', 'y', 'z']]
    rho_biCGSTAB_fun = Kernel(code, \
                              ('alpha', 'number', dtype), \
                              ('omega', 'number', dtype), \
                              ('rho', 'out', dtype), \
                              ('err', 'out', dtype), \
                              *[(name, 'grid', dtype) for name in grid_names], \
                              shape_filter='skinny')

    # Temporary values that are needed: CG variables stored on the GPU as
    # gce.Out instances (a child of gce.data).
    rho_out = Out(dtype)
    err_out = Out(dtype)
    X_temp = [Grid(dtype, x_overlap=1) for k in range(3)]  # Used to swap X.

    # Define the actual function.
    def rho_biCGSTAB_step(alpha, omega, S, X, P, T, R, R_hatH):
        rho_biCGSTAB_fun(dtype(alpha), dtype(omega), rho_out, err_out, \
                         *(S + X + X_temp + P + T + R + R_hatH), \
                         post_sync=X_temp + R)  # R must be post-synced for the upcoming alpha step.
        X[:], X_temp[:] = X_temp[:], X[:]  # Deep swap of X.
        return rho_out.get(), np.sqrt(err_out.get())

    return rho_biCGSTAB_step
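# Sketch of how the step functions above compose into one biCGSTAB iteration
# (hypothetical driver; variable setup, the MPI reductions behind rho and err,
# and convergence bookkeeping are handled by the surrounding solver). All of
# P, R, R_hatH, S, T, V, and X are lists of three gce.Grid instances.
def _example_biCGSTAB_loop(params, dtype, P, R, R_hatH, S, T, V, X, \
                           max_iters, tol):
    alpha_fun = alpha_biCGSTAB_step(params, dtype)
    omega_fun = omega_biCGSTAB_step(params, dtype)
    rho_fun = rho_biCGSTAB_step(dtype)
    rho_prev = rho = alpha = omega = dtype(1)
    err = np.inf
    for k in range(max_iters):
        alpha = alpha_fun(rho, rho_prev, alpha, omega, P, R, R_hatH, V)
        omega = omega_fun(alpha, V, S, R, T)
        rho_prev = rho
        rho, err = rho_fun(alpha, omega, S, X, P, T, R, R_hatH)
        if err < tol:
            break
    return X, err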