Example #1
def make_gpu_norm(dtype):
    """ Returns a function c=vec_norm(A) that does c=sqrt(A'A) """
    # GPU Code in gce.kernel.
    code = Template("""
        if (_in_global) {
            norm_a += conj(Ax(0,0,0))*Ax(0,0,0);
            norm_a += conj(Ay(0,0,0))*Ay(0,0,0);
            norm_a += conj(Az(0,0,0))*Az(0,0,0);
        } """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel
    grid_names = [A + i for A in ['A'] for i in ['x', 'y', 'z']]
    prod_fun = Kernel(code, \
                    ('norm_a', 'out', dtype), \
                    *[(name, 'grid', dtype) for name in grid_names], \
                    shape_filter='skinny')
    norm_a = Out(dtype)

    # Define the actual function.
    def gpu_norm(A):
        prod_fun(norm_a, *A)  # No post_sync needed; the reduced result is read back from norm_a.
        return np.sqrt(norm_a.get())

    return gpu_norm
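
A minimal usage sketch, assuming A is a three-component list [Ax, Ay, Az] of gce.Grid objects (the vector convention used throughout these examples); the names below are illustrative only.

# Hedged usage sketch: build the norm closure once, then reuse it per iteration.
vec_norm = make_gpu_norm(np.complex128)
res_norm = vec_norm(A)  # host-side scalar, sqrt(A'A)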
Example #2
def make_gpu_sum(dtype):
    """ Returns a function that does aA+bB=C """
    # Code for the rho step function.
    code = Template("""
        if (_in_global) {
            Cx(0,0,0) = a*Ax(0,0,0) + b*Bx(0,0,0);
            Cy(0,0,0) = a*Ay(0,0,0) + b*By(0,0,0);
            Cz(0,0,0) = a*Az(0,0,0) + b*Bz(0,0,0);
        } """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel
    grid_names = [A + i for A in ['A', 'B', 'C'] for i in ['x', 'y', 'z']]
    Sum_fun = Kernel(code, \
                    ('a', 'number', dtype), \
                    ('b', 'number', dtype), \
                    *[(name, 'grid', dtype) for name in grid_names], \
                    shape_filter='skinny')

    # Define the actual function.
    def gpu_sum(a, b, A, B, C):
        Sum_fun(dtype(a), dtype(b), \
                         *( A + B + C), \
                         post_sync=C)  # C must be post-synced before it is read elsewhere.

    return gpu_sum
Example #3
def make_gpu_dot(dtype):
    """ Returns a function c=vec_dot(A, B) that does c=A'B """
    # GPU Code in gce.kernel.
    code = Template("""
        if (_in_global) {
            dot_ab += conj(Ax(0,0,0))*Bx(0,0,0);
            dot_ab += conj(Ay(0,0,0))*By(0,0,0);
            dot_ab += conj(Az(0,0,0))*Bz(0,0,0);
        } """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel
    grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
    prod_fun = Kernel(code, \
                    ('dot_ab', 'out', dtype), \
                    *[(name, 'grid', dtype) for name in grid_names], \
                    shape_filter='skinny')

    dot_ab = Out(dtype)

    # Define the actual function.
    def gpu_dot(A, B):
        prod_fun( dot_ab,\
                  *( A + B))
        return dot_ab.get()

    return gpu_dot
Example #4
def rho_step(dtype):
    """ Return the function to execute the rho step of the bicg algorithm. """

    # Code for the rho step function.
    code = Template("""
        if (_in_global) {
            x0(0,0,0) = x0(0,0,0) + alpha * p0(0,0,0);
            x1(0,0,0) = x1(0,0,0) + alpha * p1(0,0,0);
            x2(0,0,0) = x2(0,0,0) + alpha * p2(0,0,0);
            {{ type }} s0 = r0(0,0,0) - alpha * v0(0,0,0);
            {{ type }} s1 = r1(0,0,0) - alpha * v1(0,0,0);
            {{ type }} s2 = r2(0,0,0) - alpha * v2(0,0,0);
            rho += (s0 * s0) + (s1 * s1) + (s2 * s2);
            err +=  (real(s0) * real(s0)) + \
                    (imag(s0) * imag(s0)) + \
                    (real(s1) * real(s1)) + \
                    (imag(s1) * imag(s1)) + \
                    (real(s2) * real(s2)) + \
                    (imag(s2) * imag(s2));
            r0(0,0,0) = s0;
            r1(0,0,0) = s1;
            r2(0,0,0) = s2;
        } """).render(type=_get_cuda_type(dtype))

    # Compile the code.
    grid_names = [A + i for A in ['p', 'r', 'v', 'x'] for i in ['0', '1', '2']]
    rho_fun = Kernel(code, \
                    ('alpha', 'number', dtype), \
                    ('rho', 'out', dtype), \
                    ('err', 'out', dtype), \
                    *[(name, 'grid', dtype) for name in grid_names], \
                    shape_filter='skinny')

    # Temporary values that are needed.
    rho_out = Out(dtype)
    err_out = Out(dtype)

    # Define the actual function.
    def rho_step(alpha, p, r, v, x):
        rho_fun(dtype(alpha), rho_out, err_out, *(p + r + v + x), \
                post_sync=r) # r must be post-synced for upcoming alpha step.
        return rho_out.get(), np.sqrt(err_out.get())

    return rho_step
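
A hedged sketch of how the returned closure might be called inside the bicg iteration; alpha is a host scalar, and p, r, v, x are assumed to be three-component lists of gce.Grid fields.

# Hedged usage sketch for the rho step (names are illustrative).
step_rho = rho_step(np.complex128)
rho_new, err = step_rho(alpha, p, r, v, x)  # updates x and r in place; err = ||r|| after the update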
Example #5
def make_gpu_copy(dtype):
    """ Returns a function that does B=A """
    # Code for the copy kernel.
    code = Template("""
        if (_in_global) {
            Bx(0,0,0) = Ax(0,0,0);
            By(0,0,0) = Ay(0,0,0);
            Bz(0,0,0) = Az(0,0,0);
        } """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel
    grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
    copy_fun = Kernel(code, \
                    *[(name, 'grid', dtype) for name in grid_names], \
                    shape_filter='skinny')

    # Define the actual function.
    def gpu_copy(A, B):
        copy_fun( \
                         *( A + B), \
                         post_sync=B)  # B must be post-synced before it is read elsewhere.

    return gpu_copy
Example #6
def make_gpu_cond(dtype, cond):
    """ Returns a function gpu_cond(A) that does A=A*C """
    # GPU Code in gce.Kernel
    code = Template("""
        if (_in_global) {
            Ax(0,0,0) = Ax(0,0,0)*Cx(0,0,0);
            Ay(0,0,0) = Ay(0,0,0)*Cy(0,0,0);
            Az(0,0,0) = Az(0,0,0)*Cz(0,0,0);
        } """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel
    grid_names = [A + i for A in ['A', 'C'] for i in ['x', 'y', 'z']]
    Sum_fun = Kernel(code, \
                    *[(name, 'grid', dtype) for name in grid_names], \
                    shape_filter='skinny')

    C = cond

    # Define the actual function.
    def gpu_cond(A):
        Sum_fun(*( A + C ), \
                 post_sync=A)  # A must be post-synced before it is read elsewhere.

    return gpu_cond
Example #7
def make_gpu_scaled_copy(dtype):
    """ Returns a function vec_scaled_copy(A, a, B) that does B=aA """
    # GPU code for the Kernel
    code = Template("""
        if (_in_global) {
            Bx(0,0,0) = a*Ax(0,0,0);
            By(0,0,0) = a*Ay(0,0,0);
            Bz(0,0,0) = a*Az(0,0,0);
        } """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel
    grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
    Sum_fun = Kernel( code, \
                      ('a', 'number', dtype), \
                      *[(name, 'grid', dtype) for name in grid_names], \
                      shape_filter='skinny')

    # Define the actual function.
    def gpu_scaled_copy(A, a, B):
        Sum_fun(dtype(a), \
                *( A + B ), \
                post_sync=B)

    return gpu_scaled_copy
Example #8
def make_gpu_addvec(dtype):
    """ Returns a function vec_addvec(A, b, B) that does A=A+bB """
    # GPU Code in gce.Kernel
    code = Template("""
        if (_in_global) {
            Ax(0,0,0) = Ax(0,0,0) + b*Bx(0,0,0);
            Ay(0,0,0) = Ay(0,0,0) + b*By(0,0,0);
            Az(0,0,0) = Az(0,0,0) + b*Bz(0,0,0);
        } """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel
    grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
    Sum_fun = Kernel(code, \
                    ('b', 'number', dtype), \
                    *[(name, 'grid', dtype) for name in grid_names], \
                    shape_filter='skinny')

    # Define the actual function.
    def gpu_addvec(A, b, B):
        Sum_fun( dtype(b), \
                *( A + B ), \
                 post_sync=A)  # A must be post-synced before it is read elsewhere.

    return gpu_addvec
Example #9
def make_gpu_scale(dtype):
    """ Returns a function scale(A, a) that does A=aA """
    # Code for the scale kernel.
    code = Template("""
        if (_in_global) {
            Ax(0,0,0) = a*Ax(0,0,0);
            Ay(0,0,0) = a*Ay(0,0,0);
            Az(0,0,0) = a*Az(0,0,0);
        } """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel
    grid_names = [A + i for A in ['A'] for i in ['x', 'y', 'z']]
    Sum_fun = Kernel(code, \
                    ('a', 'number', dtype), \
                    *[(name, 'grid', dtype) for name in grid_names], \
                    shape_filter='skinny')

    # Define the actual function.
    def gpu_scale(A, a):
        Sum_fun(dtype(a), \
                *( A ), \
                 post_sync=A)  # A must be post-synced before it is read elsewhere.

    return gpu_scale
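
The elementwise helpers above (copy, scaled copy, addvec, scale) compose into the vector updates that the fused biCGSTAB kernels further below perform in a single pass. A hedged composition sketch, assuming p, r, v are three-component lists of gce.Grid fields and beta, omega are host scalars:

# Hedged composition sketch: p <- r + beta*(p - omega*v), built from the helpers above.
addvec = make_gpu_addvec(np.complex128)  # A = A + b*B
scale = make_gpu_scale(np.complex128)    # A = a*A
addvec(p, -omega, v)  # p = p - omega*v
scale(p, beta)        # p = beta*(p - omega*v)
addvec(p, 1.0, r)     # p = r + beta*(p - omega*v)

The fused alpha_biCGSTAB_step further below performs this same update (together with the operator application) in one kernel launch, avoiding three separate passes over the grids.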
Example #10
def make_gpu_fdfd_matrix_multiplication(params, dtype):
    """ Return function vec_matrix_multiplication(X, B) that will do AX=B """

    num_shared_banks = 6

    # Render the pre-loop and in-loop code.
    cuda_type = _get_cuda_type(dtype)
    code_allpre = jinja_env.get_template('fdfd_matrix_multiplication_pec_pmc.cu').\
                    render(dims=params['shape'], \
                            type=cuda_type, \
                            mu_equals_1=False, \
                            full_operator=True)

    # Grid input parameters.
    grid_params = [(A + i, 'grid', dtype) for A in ['X', 'B', 'e', 'm'] \
                                            for i in ['x', 'y', 'z']]

    # Const input parameters.
    const_names = ('sx0', 'sy0', 'sz0', 'sx1', 'sy1', 'sz1') + \
                    ('sqrt_sx0', 'sqrt_sy0', 'sqrt_sz0', \
                    'sqrt_sx1', 'sqrt_sy1', 'sqrt_sz1') + \
                    ('bloch_x', 'bloch_y', 'bloch_z')
    const_sizes = params['shape'] * 4 + tuple([3]) * 3
    const_params = [(const_names[k], 'const', dtype, const_sizes[k]) \
                        for k in range(len(const_sizes))]
    const_params.append(('pemc', 'const', params['pemc'].dtype.type, 6))

    # Compile. (note shape_filter = 'square')
    A_multiplication_fun = Kernel('', \
                                  *(grid_params + const_params), \
                                  pre_loop=code_allpre, \
                                  padding=(1,1,1,1), \
                                  smem_per_thread=num_shared_banks*16, \
                                  shape_filter='square')

    # Temporary variables.

    # Grid variables.
    # Note: eps is scattered over the GPUs when e is initialised.
    e = [Grid(dtype(f), x_overlap=1) for f in params['e']]
    m = [Grid(dtype(f), x_overlap=1) for f in params['m']]  # Optional.

    # Constant variables.
    sc_pml_0 = [Const(dtype(s**-1)) for s in params['s']]
    sc_pml_1 = [Const(dtype(t**-1)) for t in params['t']]
    sqrt_sc_pml_0 = [Const(dtype(np.sqrt(s**-1))) for s in params['s']]
    sqrt_sc_pml_1 = [Const(dtype(np.sqrt(t**-1))) for t in params['t']]
    bloch_x = [Const(dtype(params['bloch_phase'][0]))]
    bloch_y = [Const(dtype(params['bloch_phase'][1]))]
    bloch_z = [Const(dtype(params['bloch_phase'][2]))]
    pemc = [Const(params['pemc'])]

    # Define the function
    def gpu_fdfd_matrix_multiplication(X, B):
        # Execute cuda code.
        A_multiplication_fun( \
                    *(X + B + e + m + \
                        sc_pml_0 + sc_pml_1 + sqrt_sc_pml_0 + sqrt_sc_pml_1 + \
                        bloch_x + bloch_y + bloch_z + pemc), \
                    post_sync = B)

    return gpu_fdfd_matrix_multiplication
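
A hedged usage sketch; params is the same solver parameter dict the factory already consumes (shape, e, m, s, t, bloch_phase, pemc), and X, B are assumed to be three-component lists of gce.Grid fields created with x_overlap=1.

# Hedged usage sketch: apply the FDFD operator to X, overwriting B (names illustrative).
multiply_A = make_gpu_fdfd_matrix_multiplication(params, np.complex128)
multiply_A(X, B)  # B <- A X, with B post-synced afterwards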
Example #11
def alpha_step(params, dtype):
    """ Define the alpha step function needed for the bicg algorithm. """
    num_shared_banks = 6

    # Render the pre-loop and in-loop code.
    cuda_type = _get_cuda_type(dtype)
    code_allpre = jinja_env.get_template('alpha_allpre.cu').\
                    render(dims=params['shape'], \
                            type=cuda_type, \
                            mu_equals_1=False, \
                            full_operator=True)

    # Grid input parameters.
    grid_params = [(A + i, 'grid', dtype) for A in ['P', 'P1', 'R', 'V', 'e', 'm'] \
                                            for i in ['x', 'y', 'z']]

    # Const input parameters.
    const_names = ('sx0', 'sy0', 'sz0', 'sx1', 'sy1', 'sz1') + \
                    ('sqrt_sx0', 'sqrt_sy0', 'sqrt_sz0', \
                    'sqrt_sx1', 'sqrt_sy1', 'sqrt_sz1')
    const_sizes = params['shape'] * 4
    const_params = [(const_names[k], 'const', dtype, const_sizes[k]) \
                        for k in range(len(const_sizes))]

    # Compile.
    alpha_fun = Kernel('', \
                    ('beta', 'number', dtype), \
                    ('alpha_denom', 'out', dtype), \
                    *(grid_params + const_params), \
                    pre_loop=code_allpre, \
                    padding=(1,1,1,1), \
                    smem_per_thread=num_shared_banks*16, \
                    shape_filter='square')

    # Temporary variables.
    alpha_denom_out = Out(dtype)
    p_temp = [Grid(dtype, x_overlap=1) for k in range(3)]  # Used to swap p.

    # Grid variables.
    e = [Grid(dtype(f), x_overlap=1) for f in params['e']]
    m = [Grid(dtype(f), x_overlap=1) for f in params['m']]  # Optional.

    # Constant variables.
    sc_pml_0 = [Const(dtype(s**-1)) for s in params['s']]
    sc_pml_1 = [Const(dtype(t**-1)) for t in params['t']]
    sqrt_sc_pml_0 = [Const(dtype(np.sqrt(s**-1))) for s in params['s']]
    sqrt_sc_pml_1 = [Const(dtype(np.sqrt(t**-1))) for t in params['t']]

    # Define the function
    def alpha_step(rho_k, rho_k_1, p, r, v):
        # Execute cuda code.
        # Notice that p_temp and v are post_synced.
        alpha_fun(dtype(rho_k/rho_k_1), alpha_denom_out, \
                    *(p + p_temp + r + v + e + m + \
                        sc_pml_0 + sc_pml_1 + sqrt_sc_pml_0 + sqrt_sc_pml_1), \
                    post_sync=p_temp+v)
        p[:], p_temp[:] = p_temp[:], p[:]  # Deep swap.

        return rho_k / alpha_denom_out.get()  # The value of alpha.

    return alpha_step
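
A hedged sketch of one call to the returned closure; rho_k and rho_k_1 are the current and previous rho values produced by the rho step, and p, r, v are three-component lists of gce.Grid fields. The names are illustrative only.

# Hedged usage sketch for the alpha step.
step_alpha = alpha_step(params, np.complex128)
alpha = step_alpha(rho_k, rho_k_1, p, r, v)  # p is swapped with its temporary internally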
Example #12
def omega_biCGSTAB_step(params, dtype):
    """ Define the alpha step function needed for the bicg algorithm. """
    '''
    This returns a function that will perform the alpha_step, i.e. a part of the CG algorithm
        s = r - alpha * v
        t = A*s
        omega=(t*s)/(t*t)
    the function returns alpha
    note that in omega fun does not calculate calculate since t and s are scattered over the different MPI nodes
    however omega_num and omega_denom are calculated, or at least the part that the MPInode can calculate.,
    and then put together by omega_step.
    '''
    num_shared_banks = 6

    # Render the pre-loop and in-loop code.
    cuda_type = _get_cuda_type(dtype)
    code_allpre = jinja_env.get_template('omega_bloch_pmc_pec.cu').\
                    render(dims=params['shape'], \
                            type=cuda_type, \
                            mu_equals_1=False, \
                            full_operator=True)

    # Grid input parameters.
    grid_params = [(A + i, 'grid', dtype) for A in ['V', 'S', 'R', 'T', 'e', 'm'] \
                                            for i in ['x', 'y', 'z']]

    # Const input parameters.
    const_names = ('sx0', 'sy0', 'sz0', 'sx1', 'sy1', 'sz1') + \
                    ('sqrt_sx0', 'sqrt_sy0', 'sqrt_sz0', \
                    'sqrt_sx1', 'sqrt_sy1', 'sqrt_sz1') + \
                    ('bloch_x', 'bloch_y', 'bloch_z')
    const_sizes = params['shape'] * 4 + tuple([3]) * 3
    const_params = [(const_names[k], 'const', dtype, const_sizes[k]) \
                        for k in range(len(const_sizes))]
    const_params.append(('pemc', 'const', params['pemc'].dtype.type, 6))

    # Compile. (note shape_filter = 'square')
    omega_fun = Kernel('', \
                    ('alpha', 'number', dtype), \
                    ('omega_num', 'out', dtype), \
                    ('omega_denom', 'out', dtype), \
                    *(grid_params + const_params), \
                    pre_loop=code_allpre, \
                    padding=(1,1,1,1), \
                    smem_per_thread=num_shared_banks*16, \
                    shape_filter='square')

    # Temporary variables.
    omega_num_out = Out(dtype)
    omega_denom_out = Out(dtype)

    # Grid variables.
    e = [Grid(dtype(f), x_overlap=1) for f in params['e']]
    m = [Grid(dtype(f), x_overlap=1) for f in params['m']]  # Optional.

    # Constant variables.
    sc_pml_0 = [Const(dtype(s**-1)) for s in params['s']]
    sc_pml_1 = [Const(dtype(t**-1)) for t in params['t']]
    sqrt_sc_pml_0 = [Const(dtype(np.sqrt(s**-1))) for s in params['s']]
    sqrt_sc_pml_1 = [Const(dtype(np.sqrt(t**-1))) for t in params['t']]
    bloch_x = [Const(dtype(params['bloch_phase'][0]))]
    bloch_y = [Const(dtype(params['bloch_phase'][1]))]
    bloch_z = [Const(dtype(params['bloch_phase'][2]))]
    pemc = [Const(params['pemc'])]

    # Define the function
    def omega_step(alpha, V, S, R, T, compute_omega=True):
        # Execute cuda code.
        # Notice that S and T are post_synced.
        omega_fun(dtype(alpha), omega_num_out, omega_denom_out, \
                    *( V + S + R + T + e + m + \
                        sc_pml_0 + sc_pml_1 + sqrt_sc_pml_0 + sqrt_sc_pml_1 + \
                        bloch_x + bloch_y + bloch_z + pemc), \
                    post_sync=  S + T )

        if compute_omega:
            return omega_num_out.get() / omega_denom_out.get()  # The value of omega.

    return omega_step
Example #13
def alpha_biCGSTAB_step(params, dtype):
    """ Define the alpha step function needed for the bicg algorithm. """
    '''
    This returns a function that will perform the alpha_step, i.e. a part of the biCGSTAB algorithm
        p=r+rho[k]/rho[k+1]*alpha/omega*(p-omega*v)
        v=A*p
        alpha=rho/(p*v)
    the function returns alpha
    note that in alpha fun does not calculate alpha since p and v are scattered over the different MPI nodes
    however alpha_denom is calculated, or at least the part that the MPInode can calculate., and then put together
    by alpha_step.
    '''

    num_shared_banks = 6

    # Render the pre-loop and in-loop code.
    cuda_type = _get_cuda_type(dtype)
    #code_allpre = jinja_env.get_template('alpha_biCGSTAB.cu').\
    code_allpre = jinja_env.get_template('alpha_bloch_pmc_pec.cu').\
                    render(dims=params['shape'], \
                            type=cuda_type, \
                            mu_equals_1=False, \
                            full_operator=True)

    # Grid input parameters.
    grid_params = [(A + i, 'grid', dtype) for A in ['P', 'P1', 'R', 'R_hatH', 'V', 'V1', 'e', 'm'] \
                                            for i in ['x', 'y', 'z']]

    # Const input parameters.
    const_names = ('sx0', 'sy0', 'sz0', 'sx1', 'sy1', 'sz1') + \
                    ('sqrt_sx0', 'sqrt_sy0', 'sqrt_sz0', \
                    'sqrt_sx1', 'sqrt_sy1', 'sqrt_sz1') + \
                    ('bloch_x', 'bloch_y', 'bloch_z')
    const_sizes = params['shape'] * 4 + tuple([3]) * 3
    const_params = [(const_names[k], 'const', dtype, const_sizes[k]) \
                        for k in range(len(const_sizes))]
    const_params.append(('pemc', 'const', params['pemc'].dtype.type, 6))

    # Compile. (note shape_filter = 'square')
    alpha_fun = Kernel('', \
                    ('beta', 'number', dtype), \
                    ('omega', 'number', dtype), \
                    ('alpha_denom', 'out', dtype), \
                    *(grid_params + const_params), \
                    pre_loop=code_allpre, \
                    padding=(1,1,1,1), \
                    smem_per_thread=num_shared_banks*16, \
                    shape_filter='square')

    # Temporary variables.
    alpha_denom_out = Out(dtype)
    P_temp = [Grid(dtype, x_overlap=1) for k in range(3)]  # Used to swap p.
    V_temp = [Grid(dtype, x_overlap=1) for k in range(3)]  # Used to swap v.

    # Grid variables.
    # Note: eps is scattered over the GPUs when e is initialised.
    e = [Grid(dtype(f), x_overlap=1) for f in params['e']]
    m = [Grid(dtype(f), x_overlap=1) for f in params['m']]  # Optional.

    # Constant variables.
    sc_pml_0 = [Const(dtype(s**-1)) for s in params['s']]
    sc_pml_1 = [Const(dtype(t**-1)) for t in params['t']]
    sqrt_sc_pml_0 = [Const(dtype(np.sqrt(s**-1))) for s in params['s']]
    sqrt_sc_pml_1 = [Const(dtype(np.sqrt(t**-1))) for t in params['t']]
    bloch_x = [Const(dtype(params['bloch_phase'][0]))]
    bloch_y = [Const(dtype(params['bloch_phase'][1]))]
    bloch_z = [Const(dtype(params['bloch_phase'][2]))]
    pemc = [Const(params['pemc'])]

    # Define the function
    def alpha_biCGSTAB_step(rho_k,
                            rho_k_1,
                            alpha,
                            omega,
                            P,
                            R,
                            R_hatH,
                            V,
                            compute_alpha=True):
        # Execute cuda code.
        # Notice that P_temp and V_temp are post_synced.
        alpha_fun(dtype((rho_k*alpha)/(rho_k_1*omega)), dtype(omega), alpha_denom_out, \
                    *(P + P_temp + R + R_hatH + V + V_temp + e + m + \
                        sc_pml_0 + sc_pml_1 + sqrt_sc_pml_0 + sqrt_sc_pml_1 + \
                        bloch_x + bloch_y + bloch_z + pemc), \
                    post_sync = P_temp + V_temp)
        P[:], P_temp[:] = P_temp[:], P[:]  # Deep swap.
        V[:], V_temp[:] = V_temp[:], V[:]  # Deep swap

        # TODO(logansu): Remove compute_alpha. solve_symm_lumped does not use
        # alpha_step to solve for the matrix; because solve_symm_lumped sets
        # r to the zero vector to compute the matrix multiplication,
        # alpha_denom_out comes out to be zero. The if-statement stops a
        # DivisionByZero when this happens (which is important so that we can
        # still catch real DivisionByZero errors).
        if compute_alpha:
            return rho_k / alpha_denom_out.get()  # The value of alpha.

    return alpha_biCGSTAB_step
Example #14
def rho_biCGSTAB_step(dtype):
    """ Return the function to execute the rho step of the bicg algorithm. """
    '''
    This returns a function that will perform the rho_step, i.e. a part of the 
    biCGSTAB algorithm
        x=x+alpha*p+omega*s
        r=s-omega*t
        rho[k+1]=r_hatH*r
        err=conj(r)*r
    the function returns rho[k+1] and err (it is returned to the CPU where it is 
    gathered and summed!)
    '''
    # Code for the rho step function.
    code = Template("""
        if (_in_global) {
            X1x(0,0,0) = Xx(0,0,0) + alpha*Px(0,0,0) + omega * Sx(0,0,0);
            X1y(0,0,0) = Xy(0,0,0) + alpha*Py(0,0,0) + omega * Sy(0,0,0);
            X1z(0,0,0) = Xz(0,0,0) + alpha*Pz(0,0,0) + omega * Sz(0,0,0);
            {{ type }} R_tmpx = Sx(0,0,0) - omega * Tx(0,0,0);
            {{ type }} R_tmpy = Sy(0,0,0) - omega * Ty(0,0,0);
            {{ type }} R_tmpz = Sz(0,0,0) - omega * Tz(0,0,0);
            rho += (R_hatHx(0,0,0) * R_tmpx) + (R_hatHy(0,0,0) * R_tmpy) + (R_hatHz(0,0,0) * R_tmpz);
            err +=  (real(R_tmpx) * real(R_tmpx)) + \
                    (imag(R_tmpx) * imag(R_tmpx)) + \
                    (real(R_tmpy) * real(R_tmpy)) + \
                    (imag(R_tmpy) * imag(R_tmpy)) + \
                    (real(R_tmpz) * real(R_tmpz)) + \
                    (imag(R_tmpz) * imag(R_tmpz));
            Rx(0,0,0) = R_tmpx;
            Ry(0,0,0) = R_tmpy;
            Rz(0,0,0) = R_tmpz;
        } """).render(type=_get_cuda_type(dtype))

    # Compile the code using gce.Kernel
    grid_names = [
        A + i for A in ['S', 'X', 'X1', 'P', 'T', 'R', 'R_hatH']
        for i in ['x', 'y', 'z']
    ]
    rho_biCGSTAB_fun = Kernel(code, \
                    ('alpha', 'number', dtype), \
                    ('omega', 'number', dtype), \
                    ('rho', 'out', dtype), \
                    ('err', 'out', dtype), \
                    *[(name, 'grid', dtype) for name in grid_names], \
                    shape_filter='skinny')

    # Temporary values that are needed: CG variables stored on the GPU in a gce.Out type (a child of gce.data).
    rho_out = Out(dtype)
    err_out = Out(dtype)
    X_temp = [Grid(dtype, x_overlap=1) for k in range(3)]  # Used to swap x.

    # Define the actual function.
    def rho_biCGSTAB_step(alpha, omega, S, X, P, T, R, R_hatH):
        rho_biCGSTAB_fun(dtype(alpha), dtype(omega), rho_out, err_out, \
                         *( S + X + X_temp + P + T + R + R_hatH ), \
                         post_sync=X_temp + R)  # X_temp and R must be post-synced for the upcoming alpha step.
        X[:], X_temp[:] = X_temp[:], X[:]  # deep swap X

        return rho_out.get(), np.sqrt(err_out.get())

    return rho_biCGSTAB_step
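
A hedged sketch of how the three biCGSTAB closures above (Examples #12-#14) might chain inside a single iteration. The actual solver wiring, the initialization of R_hatH, the rho bookkeeping across iterations, and convergence handling live elsewhere in the code base; all field arguments are assumed to be three-component lists of gce.Grid objects prepared by that solver.

# Hedged iteration sketch for biCGSTAB (names and ordering are illustrative).
step_alpha = alpha_biCGSTAB_step(params, dtype)
step_omega = omega_biCGSTAB_step(params, dtype)
step_rho = rho_biCGSTAB_step(dtype)

alpha = step_alpha(rho_k, rho_k_1, alpha, omega, P, R, R_hatH, V)  # updates P and V = A*P
omega = step_omega(alpha, V, S, R, T)                              # S = R - alpha*V, T = A*S
rho_k_1 = rho_k
rho_k, err = step_rho(alpha, omega, S, X, P, T, R, R_hatH)         # updates X and R; err = ||R||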