def trans(psy): ''' Transformation routine for use with PSyclone. Applies the OpenCL transform to the first Invoke in the psy object. :param psy: the PSy object which this script will transform. :type psy: :py:class:`psyclone.psyGen.PSy` :returns: the transformed PSy object. :rtype: :py:class:`psyclone.psyGen.PSy` ''' # Get the Schedule associated with the first Invoke invoke = psy.invokes.invoke_list[0] sched = invoke.schedule # Convert any kernel accesses to global data into arguments ktrans = KernelGlobalsToArguments() for kern in sched.kernels(): ktrans.apply(kern) # Transform the Schedule cltrans = OCLTrans() cltrans.apply(sched, options={"end_barrier": True}) # Provide kernel-specific OpenCL optimization options move_boundaries_trans = GOMoveIterationBoundariesInsideKernelTrans() for kern in sched.kernels(): # Move the PSy-layer loop boundaries inside the kernel as a kernel # mask, this allows to iterate through the whole domain move_boundaries_trans.apply(kern) # Specify the OpenCL queue and workgroup size of the kernel kern.set_opencl_options({"queue_number": 1, 'local_size': 4}) return psy
def test_set_kern_args_real_grid_property(): ''' Check that we generate correct code to set a real scalar grid property. ''' psy, _ = get_invoke("driver_test.f90", API, idx=0) sched = psy.invokes.invoke_list[0].schedule # Currently, moving the boundaries inside the kernel is a prerequisite # for the GOcean gen_ocl() code generation. trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in sched.coded_kernels(): trans.apply(kernel) otrans = OCLTrans() otrans.apply(sched) generated_code = str(psy.gen) expected = '''\ SUBROUTINE compute_kernel_code_set_args(kernel_obj, out_fld, in_out_fld, \ in_fld, dx, dx_1, gphiu, xstart, xstop, ystart, ystop) USE clfortran, ONLY: clSetKernelArg USE iso_c_binding, ONLY: c_sizeof, c_loc, c_intptr_t USE ocl_utils_mod, ONLY: check_status INTEGER(KIND=c_intptr_t), intent(in), target :: out_fld, in_out_fld, \ in_fld, dx, gphiu REAL(KIND=go_wp), intent(in), target :: dx_1 INTEGER, intent(in), target :: xstart, xstop, ystart, ystop''' assert expected in generated_code
def trans(psy): ''' Transformation routine for use with PSyclone. Applies the OpenCL transform to the first Invoke in the psy object. :param psy: the PSy object which this script will transform. :type psy: :py:class:`psyclone.psyGen.PSy` :returns: the transformed PSy object. :rtype: :py:class:`psyclone.psyGen.PSy` ''' ocl_trans = OCLTrans() fold_trans = FoldConditionalReturnExpressionsTrans() move_boundaries_trans = GOMoveIterationBoundariesInsideKernelTrans() # Get the Schedule associated with the first Invoke invoke = psy.invokes.invoke_list[0] sched = invoke.schedule # Transform the Schedule ocl_trans.apply(sched, options={"end_barrier": True}) # Provide kernel-specific OpenCL optimization options for kern in sched.kernels(): # Move the PSy-layer loop boundaries inside the kernel as a kernel # mask, this allows to iterate through the whole domain move_boundaries_trans.apply(kern) # Change the syntax to remove the return statements introduced by the # previous transformation fold_trans.apply(kern.get_kernel_schedule()) # Specify the OpenCL queue and workgroup size of the kernel kern.set_opencl_options({"queue_number": 1, 'local_size': 4}) return psy
def test_set_kern_args(kernel_outputdir): ''' Check that we generate the necessary code to set kernel arguments. ''' psy, _ = get_invoke("single_invoke_two_kernels.f90", API, idx=0) sched = psy.invokes.invoke_list[0].schedule # Currently, moving the boundaries inside the kernel is a prerequisite # for the GOcean gen_ocl() code generation. trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in sched.coded_kernels(): trans.apply(kernel) otrans = OCLTrans() otrans.apply(sched) generated_code = str(psy.gen) # Check we've only generated one set-args routine with arguments: # kernel object + kernel arguments + boundary values assert generated_code.count("SUBROUTINE compute_cu_code_set_args(" "kernel_obj, cu_fld, p_fld, u_fld, xstart, " "xstop, ystart, ystop)") == 1 # Declarations expected = '''\ USE clfortran, ONLY: clSetKernelArg USE iso_c_binding, ONLY: c_sizeof, c_loc, c_intptr_t USE ocl_utils_mod, ONLY: check_status INTEGER(KIND=c_intptr_t), intent(in), target :: cu_fld, p_fld, u_fld INTEGER, intent(in), target :: xstart, xstop, ystart, ystop INTEGER ierr INTEGER(KIND=c_intptr_t), target :: kernel_obj''' assert expected in generated_code expected = '''\ ! Set the arguments for the compute_cu_code OpenCL Kernel ierr = clSetKernelArg(kernel_obj, 0, C_SIZEOF(cu_fld), C_LOC(cu_fld)) CALL check_status('clSetKernelArg: arg 0 of compute_cu_code', ierr) ierr = clSetKernelArg(kernel_obj, 1, C_SIZEOF(p_fld), C_LOC(p_fld)) CALL check_status('clSetKernelArg: arg 1 of compute_cu_code', ierr) ierr = clSetKernelArg(kernel_obj, 2, C_SIZEOF(u_fld), C_LOC(u_fld)) CALL check_status('clSetKernelArg: arg 2 of compute_cu_code', ierr) ierr = clSetKernelArg(kernel_obj, 3, C_SIZEOF(xstart), C_LOC(xstart)) CALL check_status('clSetKernelArg: arg 3 of compute_cu_code', ierr) ierr = clSetKernelArg(kernel_obj, 4, C_SIZEOF(xstop), C_LOC(xstop)) CALL check_status('clSetKernelArg: arg 4 of compute_cu_code', ierr) ierr = clSetKernelArg(kernel_obj, 5, C_SIZEOF(ystart), C_LOC(ystart)) CALL check_status('clSetKernelArg: arg 5 of compute_cu_code', ierr) ierr = clSetKernelArg(kernel_obj, 6, C_SIZEOF(ystop), C_LOC(ystop)) CALL check_status('clSetKernelArg: arg 6 of compute_cu_code', ierr) END SUBROUTINE compute_cu_code_set_args''' assert expected in generated_code # The call to the set_args matches the expected kernel signature with # the boundary values converted to 0-indexing assert ("CALL compute_cu_code_set_args(kernel_compute_cu_code, " "cu_fld%device_ptr, p_fld%device_ptr, u_fld%device_ptr, " "xstart - 1, xstop - 1, " "ystart - 1, ystop - 1)" in generated_code) # There is also only one version of the set_args for the second kernel assert generated_code.count("SUBROUTINE time_smooth_code_set_args(" "kernel_obj, u_fld, unew_fld, uold_fld, " "xstart_1, xstop_1, ystart_1, ystop_1)") == 1 assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy)
def test_validation(): ''' Check that the transformation can only be applied to routine nodes ''' trans = GOMoveIterationBoundariesInsideKernelTrans() with pytest.raises(TransformationError) as info: trans.apply(None) assert("Error in GOMoveIterationBoundariesInsideKernelTrans " "transformation. This transformation can only be applied to " "'GOKern' nodes, but found 'NoneType'." in str(info.value))
def test_field_arguments(kernel_outputdir): # pylint: disable=unused-argument ''' Test that with an invoke transformed to OpenCL that has fields, all the fields are initialized into OpenCL buffers, the data is copied in, and a function to get the data back from the device is also generated.''' psy, _ = get_invoke("single_invoke.f90", API, idx=0) sched = psy.invokes.invoke_list[0].schedule # Currently, moving the boundaries inside the kernel is a prerequisite # for the GOcean gen_ocl() code generation. trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in sched.coded_kernels(): trans.apply(kernel) otrans = OCLTrans() otrans.apply(sched) generated_code = str(psy.gen).lower() # The array size expression always uses the same field, in this case p_fld size_expression = "int(p_fld%grid%nx*p_fld%grid%ny, 8)" # For each of the invoke fields, add a conditional block that: # - Creates a OpenCL rw buffer. # - Writes the field data into the buffer. # - Marks data_on_device flag as true. # - Points the read_from_device_f attribute to the read_from_device # local function. # - Blocks OpenCL Queue until the data copy has finished. single_invoke_fields = ["cu_fld", "p_fld", "u_fld"] for field in single_invoke_fields: expected = ( " if (.not. {0}%data_on_device) then\n" " size_in_bytes = " + size_expression + "*c_sizeof({0}%data(1,1))\n" " ! create buffer on device\n" " {0}%device_ptr = create_rw_buffer(size_in_bytes)\n" " ierr = clenqueuewritebuffer(cmd_queues(1), " "{0}%device_ptr, cl_true, 0_8, size_in_bytes, " "c_loc({0}%data), 0, c_null_ptr, " "c_loc(write_event))\n" " {0}%data_on_device = .true.\n" " {0}%read_from_device_f => read_from_device\n" " ! block until data copies have finished\n" " ierr = clfinish(cmd_queues(1))\n" " end if\n").format(field) assert expected in generated_code # Check that the read_from_device routine has also been generated. expected = (" subroutine read_from_device(from, to, nx, ny, width)\n" " use iso_c_binding, only: c_intptr_t\n" " use fortcl, only: read_buffer\n" " integer(kind=c_intptr_t), intent(in) :: from\n" " real(kind=go_wp), intent(inout), dimension(:,:) :: to\n" " integer, intent(in) :: nx, ny, width\n" " call read_buffer(from, to, int(width*ny, kind=8))\n" " end subroutine read_from_device\n") assert expected in generated_code
def test_opencl_options_validation(): ''' Check that OpenCL options which are not supported provide appropiate errors. ''' psy, _ = get_invoke("single_invoke.f90", API, idx=0) sched = psy.invokes.invoke_list[0].schedule # Currently, moving the boundaries inside the kernel is a prerequisite # for the GOcean gen_ocl() code generation. trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in sched.coded_kernels(): trans.apply(kernel) otrans = OCLTrans() # Unsupported options are not accepted with pytest.raises(TransformationError) as err: otrans.apply(sched, options={'unsupported': 1}) assert "InvokeSchedule does not support the OpenCL option 'unsupported'." \ in str(err.value) # end_barrier option must be a boolean with pytest.raises(TransformationError) as err: otrans.apply(sched, options={'end_barrier': 1}) assert "InvokeSchedule OpenCL option 'end_barrier' should be a boolean." \ in str(err.value) # enable_profiling option must be a boolean with pytest.raises(TransformationError) as err: otrans.apply(sched, options={'enable_profiling': 1}) assert ("InvokeSchedule OpenCL option 'enable_profiling' should be a " "boolean." in str(err.value)) # out_of_order option must be a boolean with pytest.raises(TransformationError) as err: otrans.apply(sched, options={'out_of_order': 1}) assert "InvokeSchedule OpenCL option 'out_of_order' should be a boolean." \ in str(err.value) # Unsupported kernel options are not accepted with pytest.raises(AttributeError) as err: sched.coded_kernels()[0].set_opencl_options({'unsupported': 1}) assert "CodedKern does not support the OpenCL option 'unsupported'." \ in str(err.value) # local_size must be an integer with pytest.raises(TypeError) as err: sched.coded_kernels()[0].set_opencl_options({'local_size': 'a'}) assert "CodedKern OpenCL option 'local_size' should be an integer." \ in str(err.value) # queue_number must be an integer with pytest.raises(TypeError) as err: sched.coded_kernels()[0].set_opencl_options({'queue_number': 'a'}) assert "CodedKern OpenCL option 'queue_number' should be an integer." \ in str(err.value)
def test_set_kern_float_arg(): ''' Check that we generate correct code to set a real, scalar kernel argument. ''' psy, _ = get_invoke("single_invoke_scalar_float_arg.f90", API, idx=0) sched = psy.invokes.invoke_list[0].schedule # Currently, moving the boundaries inside the kernel is a prerequisite # for the GOcean gen_ocl() code generation. trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in sched.coded_kernels(): trans.apply(kernel) otrans = OCLTrans() otrans.apply(sched) generated_code = str(psy.gen) # This set_args has a name clash on xstop (one is a grid property and the # other a loop boundary). One of they should appear as 'xstop_1' expected = '''\ SUBROUTINE bc_ssh_code_set_args(kernel_obj, a_scalar, ssh_fld, xstop, \ tmask, xstart, xstop_1, ystart, ystop) USE clfortran, ONLY: clSetKernelArg USE iso_c_binding, ONLY: c_sizeof, c_loc, c_intptr_t USE ocl_utils_mod, ONLY: check_status INTEGER(KIND=c_intptr_t), intent(in), target :: ssh_fld, tmask INTEGER, intent(in), target :: xstop REAL(KIND=go_wp), intent(in), target :: a_scalar INTEGER, intent(in), target :: xstart, xstop_1, ystart, ystop INTEGER ierr INTEGER(KIND=c_intptr_t), target :: kernel_obj ''' assert expected in generated_code expected = '''\ ! Set the arguments for the bc_ssh_code OpenCL Kernel ierr = clSetKernelArg(kernel_obj, 0, C_SIZEOF(a_scalar), C_LOC(a_scalar)) CALL check_status('clSetKernelArg: arg 0 of bc_ssh_code', ierr) ierr = clSetKernelArg(kernel_obj, 1, C_SIZEOF(ssh_fld), C_LOC(ssh_fld)) CALL check_status('clSetKernelArg: arg 1 of bc_ssh_code', ierr) ierr = clSetKernelArg(kernel_obj, 2, C_SIZEOF(xstop), C_LOC(xstop)) CALL check_status('clSetKernelArg: arg 2 of bc_ssh_code', ierr) ierr = clSetKernelArg(kernel_obj, 3, C_SIZEOF(tmask), C_LOC(tmask)) CALL check_status('clSetKernelArg: arg 3 of bc_ssh_code', ierr) ierr = clSetKernelArg(kernel_obj, 4, C_SIZEOF(xstart), C_LOC(xstart)) CALL check_status('clSetKernelArg: arg 4 of bc_ssh_code', ierr) ierr = clSetKernelArg(kernel_obj, 5, C_SIZEOF(xstop_1), C_LOC(xstop_1)) CALL check_status('clSetKernelArg: arg 5 of bc_ssh_code', ierr) ierr = clSetKernelArg(kernel_obj, 6, C_SIZEOF(ystart), C_LOC(ystart)) CALL check_status('clSetKernelArg: arg 6 of bc_ssh_code', ierr) ierr = clSetKernelArg(kernel_obj, 7, C_SIZEOF(ystop), C_LOC(ystop)) CALL check_status('clSetKernelArg: arg 7 of bc_ssh_code', ierr) END SUBROUTINE bc_ssh_code_set_args''' assert expected in generated_code
def test_opencl_options_effects(): ''' Check that the OpenCL options produce the expected changes in the PSy layer. ''' psy, _ = get_invoke("single_invoke.f90", API, idx=0) sched = psy.invokes.invoke_list[0].schedule # Currently, moving the boundaries inside the kernel is a prerequisite # for the GOcean gen_ocl() code generation. trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in sched.coded_kernels(): trans.apply(kernel) otrans = OCLTrans() otrans.apply(sched) generated_code = str(psy.gen) # By default there is 1 queue, with an end barrier and local_size is 64 assert "localsize = (/64, 1/)" in generated_code assert "ierr = clEnqueueNDRangeKernel(cmd_queues(1), " \ "kernel_compute_cu_code, 2, C_NULL_PTR, C_LOC(globalsize), " \ "C_LOC(localsize), 0, C_NULL_PTR, C_NULL_PTR)" in generated_code assert "! Block until all kernels have finished\n" \ " ierr = clFinish(cmd_queues(1))" in generated_code assert "ierr = clFinish(cmd_queues(2))" not in generated_code # Change kernel local_size to 4 sched.coded_kernels()[0].set_opencl_options({'local_size': 4}) generated_code = str(psy.gen) assert "localsize = (/4, 1/)" in generated_code # Change kernel queue to 2 (the barrier should then also go up to 2) sched.coded_kernels()[0].set_opencl_options({'queue_number': 2}) generated_code = str(psy.gen) assert "ierr = clEnqueueNDRangeKernel(cmd_queues(2), " \ "kernel_compute_cu_code, 2, C_NULL_PTR, C_LOC(globalsize), " \ "C_LOC(localsize), 0, C_NULL_PTR, C_NULL_PTR)" in generated_code assert "! Block until all kernels have finished\n" \ " ierr = clFinish(cmd_queues(1))\n" \ " ierr = clFinish(cmd_queues(2))\n" in generated_code assert "ierr = clFinish(cmd_queues(3))" not in generated_code # Remove barrier at the end of the Invoke otrans.apply(sched, options={'end_barrier': False}) generated_code = str(psy.gen) assert "! Block until all kernels have finished" not in generated_code assert "ierr = clFinish(cmd_queues(2))" not in generated_code
def test_opencl_kernel_output_file_with_suffix(kernel_outputdir): '''Check that an OpenCL file named modulename_kernelname_0 is generated without the _code suffix in the kernelname ''' psy, _ = get_invoke("single_invoke.f90", API, idx=0) sched = psy.invokes.invoke_list[0].schedule # Currently, moving the boundaries inside the kernel is a prerequisite # for the GOcean gen_ocl() code generation. trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in sched.coded_kernels(): trans.apply(kernel) otrans = OCLTrans() otrans.apply(sched) _ = psy.gen # Generates the OpenCL kernels as a side-effect. assert os.path.exists( os.path.join(str(kernel_outputdir), "compute_cu_compute_cu_0.cl"))
def test_invoke_use_stmts_and_decls(kernel_outputdir, monkeypatch, debug_mode): ''' Test that generating code for OpenCL results in the correct module use statements and declarations. ''' api_config = Config.get().api_conf("gocean1.0") monkeypatch.setattr(api_config, "_debug_mode", debug_mode) psy, _ = get_invoke("single_invoke.f90", API, idx=0) sched = psy.invokes.invoke_list[0].schedule # Currently, moving the boundaries inside the kernel is a prerequisite # for the GOcean gen_ocl() code generation. trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in sched.coded_kernels(): trans.apply(kernel) otrans = OCLTrans() otrans.apply(sched) generated_code = str(psy.gen).lower() expected = '''\ subroutine invoke_0_compute_cu(cu_fld, p_fld, u_fld) use fortcl, only: create_rw_buffer\n''' # When in debug mode, import also the check_status function if debug_mode: expected += " use ocl_utils_mod, only: check_status\n" expected += '''\ use fortcl, only: get_num_cmd_queues, get_cmd_queues, get_kernel_by_name use clfortran use iso_c_binding type(r2d_field), intent(inout), target :: cu_fld, p_fld, u_fld integer xstart, xstop, ystart, ystop integer(kind=c_size_t), target :: localsize(2) integer(kind=c_size_t), target :: globalsize(2) integer(kind=c_intptr_t), target :: write_event integer(kind=c_size_t) size_in_bytes integer(kind=c_intptr_t), target, save :: kernel_compute_cu_code logical, save :: first_time=.true. integer ierr integer(kind=c_intptr_t), pointer, save :: cmd_queues(:) integer, save :: num_cmd_queues ''' assert expected in generated_code assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy)
def test_grid_proprty(kernel_outputdir): # pylint: disable=unused-argument ''' Test that using nx and ny from the gocean property dictionary works.''' psy, _ = get_invoke("single_invoke.f90", API, idx=0) sched = psy.invokes.invoke_list[0].schedule # Currently, moving the boundaries inside the kernel is a prerequisite # for the GOcean gen_ocl() code generation. trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in sched.coded_kernels(): trans.apply(kernel) otrans = OCLTrans() otrans.apply(sched) generated_code = str(psy.gen).lower() assert "globalsize = (/p_fld%grid%nx, p_fld%grid%ny/)" in generated_code expected = "size_in_bytes = int(p_fld%grid%nx*p_fld%grid%ny, 8)*" \ "c_sizeof(p_fld%data(1,1))" assert expected in generated_code
def test_invoke_opencl_initialisation(kernel_outputdir): ''' Test that generating code for OpenCL results in the correct OpenCL first time initialisation code ''' psy, _ = get_invoke("single_invoke.f90", API, idx=0) sched = psy.invokes.invoke_list[0].schedule # Currently, moving the boundaries inside the kernel is a prerequisite # for the GOcean gen_ocl() code generation. trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in sched.coded_kernels(): trans.apply(kernel) otrans = OCLTrans() otrans.apply(sched) generated_code = str(psy.gen).lower() # Test that the necessary variables are declared at the beginning # of the invoke expected = '''\ integer(kind=c_size_t), target :: localsize(2) integer(kind=c_size_t), target :: globalsize(2) integer(kind=c_intptr_t), target :: write_event integer(kind=c_size_t) size_in_bytes integer(kind=c_intptr_t), target, save :: kernel_compute_cu_code logical, save :: first_time=.true. integer ierr integer(kind=c_intptr_t), pointer, save :: cmd_queues(:) integer, save :: num_cmd_queues''' assert expected in generated_code # Test that a conditional 'first_time' code is generated with the # expected initialisation statements expected = '''\ if (first_time) then first_time = .false. ! ensure opencl run-time is initialised for this psy-layer module call psy_init num_cmd_queues = get_num_cmd_queues() cmd_queues => get_cmd_queues() kernel_compute_cu_code = get_kernel_by_name("compute_cu_code") end if''' assert expected in generated_code assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy)
def trans(psy): ''' Transformation routine for use with PSyclone. Converts any global-variable accesses into kernel arguments and then applies the OpenCL transformation to the PSy layer. :param psy: the PSy object which this script will transform. :type psy: :py:class:`psyclone.psyGen.PSy` :returns: the transformed PSy object. :rtype: :py:class:`psyclone.psyGen.PSy` ''' # Get the necessary transformations tinfo = TransInfo() globaltrans = tinfo.get_trans_name('KernelGlobalsToArguments') move_boundaries_trans = GOMoveIterationBoundariesInsideKernelTrans() cltrans = tinfo.get_trans_name('OCLTrans') for invoke in psy.invokes.invoke_list: print("Converting to OpenCL invoke: " + invoke.name) schedule = invoke.schedule # Skip invoke_2 as its time_smooth_code kernel contains a # module variable (alpha) which is not dealt with by the # KernelGlobalsToArguments transformation, see issue #826. if invoke.name == "invoke_2": continue # Remove the globals from inside each kernel and move PSy-layer # loop boundaries inside the kernel as a mask. for kern in schedule.kernels(): print("Update kernel: " + kern.name) move_boundaries_trans.apply(kern) globaltrans.apply(kern) # Transform invoke to OpenCL cltrans.apply(schedule) return psy
def test_opencl_code_generation_with_boundary_mask(): ''' Tests that OpenCL kernel generated after applying the GOMoveIterationBoundariesInsideKernelTrans has the 4 boundary values as kernel arguments and has a masking statement at the beginning of the executable code. ''' psy, _ = get_invoke("single_invoke.f90", API, idx=0, dist_mem=False) sched = psy.invokes.invoke_list[0].schedule kernel = sched.children[0].loop_body[0].loop_body[0] # compute_cu kernel trans = GOMoveIterationBoundariesInsideKernelTrans() trans.apply(kernel) kschedule = kernel.get_kernel_schedule() expected_code = ("__kernel void compute_cu_code(\n" " __global double * restrict cu,\n" " __global double * restrict p,\n" " __global double * restrict u,\n" " int xstart,\n" " int xstop,\n" " int ystart,\n" " int ystop\n" " ){\n" " int cuLEN1 = get_global_size(0);\n" " int cuLEN2 = get_global_size(1);\n" " int pLEN1 = get_global_size(0);\n" " int pLEN2 = get_global_size(1);\n" " int uLEN1 = get_global_size(0);\n" " int uLEN2 = get_global_size(1);\n" " int i = get_global_id(0);\n" " int j = get_global_id(1);\n" " if ((((i < xstart) || (i > xstop)) || ((j < ystart) ||" " (j > ystop)))) {\n" " return;\n" " }\n" " cu[j * cuLEN1 + i] = ((0.5e0 * (p[j * pLEN1 + (i + 1)]" " + p[j * pLEN1 + i])) * u[j * uLEN1 + i]);\n" "}\n\n") openclwriter = OpenCLWriter() assert expected_code == openclwriter(kschedule)
def test_psy_init_with_options(kernel_outputdir): ''' Check that we create a psy_init() routine that sets-up the OpenCL environment with the provided non-default options. ''' psy, _ = get_invoke("single_invoke.f90", API, idx=0) sched = psy.invokes.invoke_list[0].schedule # Currently, moving the boundaries inside the kernel is a prerequisite # for the GOcean gen_ocl() code generation. trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in sched.coded_kernels(): trans.apply(kernel) otrans = OCLTrans() otrans.apply(sched, options={ "end_barrier": True, "enable_profiling": True, "out_of_order": True }) generated_code = str(psy.gen) assert "CALL ocl_env_init(1, ocl_device_num, .True., .True.)\n" \ in generated_code assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy)
def test_opencl_multi_invoke_options_validation(option_to_check): ''' Check that the OpenCL options constrains are enforced when there are multiple invokes. ''' psy, _ = get_invoke("test12_two_invokes_two_kernels.f90", API, idx=0) invoke1_schedule = psy.invokes.invoke_list[0].schedule invoke2_schedule = psy.invokes.invoke_list[1].schedule # Currently, moving the boundaries inside the kernel is a prerequisite # for the GOcean gen_ocl() code generation. trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in invoke1_schedule.coded_kernels(): trans.apply(kernel) for kernel in invoke2_schedule.coded_kernels(): trans.apply(kernel) otrans = OCLTrans() otrans.apply(invoke1_schedule, options={option_to_check: False}) otrans.apply(invoke2_schedule, options={option_to_check: True}) with pytest.raises(NotImplementedError) as err: _ = str(psy.gen) assert ("The current implementation creates a single OpenCL context for " "all the invokes which needs certain OpenCL options to match " "between invokes. Found '{0}' with unmatching values between " "invokes.".format(option_to_check) in str(err.value))
def trans(psy): ''' Transform the schedule for OpenCL generation ''' # Import transformations tinfo = TransInfo() globaltrans = tinfo.get_trans_name('KernelImportsToArguments') move_boundaries_trans = GOMoveIterationBoundariesInsideKernelTrans() cltrans = GOOpenCLTrans() # Get the invoke routine schedule = psy.invokes.get('invoke_0').schedule # Map the kernels by their name to different OpenCL queues. The multiple # command queues can be executed concurrently while each command queue # executes in-order its kernels. This provides functional parallelism # when kernels don't have dependencies between them. qmap = { 'continuity_code': 1, 'momentum_u_code': 2, 'momentum_v_code': 3, 'bc_ssh_code': 1, 'bc_solid_u_code': 2, 'bc_solid_v_code': 3, 'bc_flather_u_code': 2, 'bc_flather_v_code': 3, 'field_copy_code': 1, 'next_sshu_code': 1, 'next_sshv_code': 1 } # Remove global variables from inside each kernel, pass the boundary # values as arguments to the kernel and set the OpenCL work size to 64, # which is required for performance (with OpenCL < 1.2 this requires # the resulting application to be executed with DL_ESM_ALIGNMENT=64). # Technically the OpenCL global_size (which is controlled by # DL_ESM_ALIGNMENT) must be divisible by the work_group_size (which # is set to 64 in the psyclone script) in OpenCL implementations < 2.0. # But from OpenCL 2.0 the standard says its not necessary anymore. # In practice it is safe to always use it as most implementations # are lacking in this aspect. # If using a different WORK_GROUP_SIZE, make sure to update the # DL_ESM_ALIGNMENT to match. for kern in schedule.kernels(): print(kern.name) globaltrans.apply(kern) if MOVE_BOUNDARIES: move_boundaries_trans.apply(kern) if FUCTIONAL_PARALLELISM: kern.set_opencl_options({ 'local_size': WORK_GROUP_SIZE, 'queue_number': qmap[kern.name] }) else: kern.set_opencl_options({'local_size': WORK_GROUP_SIZE}) # Transform invoke to OpenCL cltrans.apply(schedule) if XILINX_CONFIG_FILE: # Create a Xilinx Compiler Configuration file path = Config.get().kernel_output_dir with open(os.path.join(path, "xilinx.cfg"), "w") as cfgfile: cfgfile.write("# Xilinx FPGA configuration file\n") # cfgfile.write("[connectivity]\n") # cfgfile.write("# Create 2 CU of the given kernels\n") # cfgfile.write("nk=continuity_code:2\n") # cfgfile.write("nk=momentum_u_code:2\n") # cfgfile.write("nk=momentum_v_code:2\n") # cfgfile.write("nk=bc_ssh_code:2\n") # cfgfile.write("\n[hls]\n") # cfgfile.write("# Assign CUs to different SLRs\n") # cfgfile.write("slr=momentum_u_code_1:SLR0\n") # cfgfile.write("slr=momentum_u_code_2:SLR0\n") # cfgfile.write("slr=momentum_v_code_1:SLR2\n") # cfgfile.write("slr=momentum_v_code_2:SLR2\n") return psy
def test_psy_init(kernel_outputdir, monkeypatch): ''' Check that we create a psy_init() routine that sets-up the OpenCL environment. ''' psy, _ = get_invoke("single_invoke.f90", API, idx=0) sched = psy.invokes.invoke_list[0].schedule # Currently, moving the boundaries inside the kernel is a prerequisite # for the GOcean gen_ocl() code generation. trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in sched.coded_kernels(): trans.apply(kernel) otrans = OCLTrans() otrans.apply(sched) generated_code = str(psy.gen) expected = ( " SUBROUTINE psy_init()\n" " USE fortcl, ONLY: ocl_env_init, add_kernels\n" " CHARACTER(LEN=30) kernel_names(1)\n" " INTEGER :: ocl_device_num=1\n" " LOGICAL, save :: initialised=.False.\n" " ! Check to make sure we only execute this routine once\n" " IF (.not. initialised) THEN\n" " initialised = .True.\n" " ! Initialise the OpenCL environment/device\n" " CALL ocl_env_init(1, ocl_device_num, .False., .False.)\n" " ! The kernels this PSy layer module requires\n" " kernel_names(1) = \"compute_cu_code\"\n" " ! Create the OpenCL kernel objects. Expects to find all of " "the compiled\n" " ! kernels in FORTCL_KERNELS_FILE.\n" " CALL add_kernels(1, kernel_names)\n" " END IF\n" " END SUBROUTINE psy_init\n") assert expected in generated_code assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy) # Test with a non-default number of OpenCL queues sched.coded_kernels()[0].set_opencl_options({'queue_number': 5}) generated_code = str(psy.gen) expected = ( " SUBROUTINE psy_init()\n" " USE fortcl, ONLY: ocl_env_init, add_kernels\n" " CHARACTER(LEN=30) kernel_names(1)\n" " INTEGER :: ocl_device_num=1\n" " LOGICAL, save :: initialised=.False.\n" " ! Check to make sure we only execute this routine once\n" " IF (.not. initialised) THEN\n" " initialised = .True.\n" " ! Initialise the OpenCL environment/device\n" " CALL ocl_env_init(5, ocl_device_num, .False., .False.)\n" " ! The kernels this PSy layer module requires\n" " kernel_names(1) = \"compute_cu_code\"\n" " ! Create the OpenCL kernel objects. Expects to find all of " "the compiled\n" " ! kernels in FORTCL_KERNELS_FILE.\n" " CALL add_kernels(1, kernel_names)\n" " END IF\n" " END SUBROUTINE psy_init\n") assert expected in generated_code assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy) # Test with a different configuration value for OCL_DEVICES_PER_NODE # that needs a mod() and a get_rank() expression. monkeypatch.setattr(Config.get(), "_ocl_devices_per_node", 2) generated_code = str(psy.gen) expected = ( " SUBROUTINE psy_init()\n" " USE parallel_mod, ONLY: get_rank\n" " USE fortcl, ONLY: ocl_env_init, add_kernels\n" " CHARACTER(LEN=30) kernel_names(1)\n" " INTEGER :: ocl_device_num=1\n" " LOGICAL, save :: initialised=.False.\n" " ! Check to make sure we only execute this routine once\n" " IF (.not. initialised) THEN\n" " initialised = .True.\n" " ! Initialise the OpenCL environment/device\n" " ocl_device_num = mod(get_rank() - 1, 2) + 1\n" " CALL ocl_env_init(5, ocl_device_num, .False., .False.)\n" " ! The kernels this PSy layer module requires\n" " kernel_names(1) = \"compute_cu_code\"\n" " ! Create the OpenCL kernel objects. Expects to find all of " "the compiled\n" " ! kernels in FORTCL_KERNELS_FILE.\n" " CALL add_kernels(1, kernel_names)\n" " END IF\n" " END SUBROUTINE psy_init\n") assert expected in generated_code assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy)
def test_go_move_iteration_boundaries_inside_kernel_trans(): ''' Tests that the GOMoveIterationBoundariesInsideKernelTrans transformation for the GOcean API adds the 4 boundary values as kernel arguments and adds a masking statement at the beginning of the code. ''' psy, _ = get_invoke("single_invoke.f90", API, idx=0, dist_mem=False) sched = psy.invokes.invoke_list[0].schedule kernel = sched.children[0].loop_body[0].loop_body[0] # compute_cu kernel num_args = len(kernel.arguments.args) # Add some name conflicting symbols in the Invoke and the Kernel kernel.root.symbol_table.new_symbol("xstop") kernel.get_kernel_schedule().symbol_table.new_symbol("ystart") # Apply the transformation trans = GOMoveIterationBoundariesInsideKernelTrans() trans.apply(kernel) # Check that the kernel call have been transformed: # - Immediately before the loop there are the boundaries assignments assert isinstance(sched.children[0], Assignment) assert sched.children[0].lhs.symbol.name == "xstart" assert isinstance(sched.children[1], Assignment) assert sched.children[1].lhs.symbol.name == "xstop_1" assert isinstance(sched.children[2], Assignment) assert sched.children[2].lhs.symbol.name == "ystart" assert isinstance(sched.children[3], Assignment) assert sched.children[3].lhs.symbol.name == "ystop" # - The loops have been transformed assert isinstance(sched.children[4], GOLoop) assert sched.children[4].field_space == "go_every" assert sched.children[4].iteration_space == "go_all_pts" assert isinstance(sched.children[4].loop_body[0], GOLoop) assert sched.children[4].loop_body[0].field_space == "go_every" assert sched.children[4].loop_body[0].iteration_space == "go_all_pts" # - And the appropriate arguments have been added to the kernel call assert len(kernel.arguments.args) == num_args + 4 assert kernel.arguments.args[-4].name == "xstart" assert kernel.arguments.args[-4].argument_type == "scalar" assert kernel.arguments.args[-3].name == "xstop_1" assert kernel.arguments.args[-3].argument_type == "scalar" assert kernel.arguments.args[-2].name == "ystart" assert kernel.arguments.args[-1].argument_type == "scalar" assert kernel.arguments.args[-1].name == "ystop" assert kernel.arguments.args[-1].argument_type == "scalar" # Check that the kernel subroutine has been transformed: kschedule = kernel.get_kernel_schedule() # - It has the boundary conditions mask assert isinstance(kschedule.children[0], IfBlock) assert str(kschedule.children[0].condition) == ( "BinaryOperation[operator:'OR']\n" "BinaryOperation[operator:'OR']\n" "BinaryOperation[operator:'LT']\n" "Reference[name:'i']\n" "Reference[name:'xstart']\n" "BinaryOperation[operator:'GT']\n" "Reference[name:'i']\n" "Reference[name:'xstop']\n" "BinaryOperation[operator:'OR']\n" "BinaryOperation[operator:'LT']\n" "Reference[name:'j']\n" "Reference[name:'ystart_1']\n" "BinaryOperation[operator:'GT']\n" "Reference[name:'j']\n" "Reference[name:'ystop']") assert isinstance(kschedule.children[0].if_body[0], Return) # - It has the boundary symbol as kernel arguments assert isinstance(kschedule.symbol_table.lookup("xstart").interface, ArgumentInterface) assert isinstance(kschedule.symbol_table.lookup("xstop").interface, ArgumentInterface) assert isinstance(kschedule.symbol_table.lookup("ystart_1").interface, ArgumentInterface) assert isinstance(kschedule.symbol_table.lookup("ystop").interface, ArgumentInterface)
def test_go_move_iteration_boundaries_inside_kernel_two_kernels_apply_twice(): ''' Tests that the GOMoveIterationBoundariesInsideKernelTrans transformation for the GOcean API produces the expected code when the invoke has two kernels and the transformation is applied twice. We check that the kernels don't use the same boundary values (some are postfixed with a number) and that kernels don't duplicate boundary arguments themself when applying the transformation twice. ''' psy, _ = get_invoke("single_invoke_two_kernels.f90", API, idx=0, dist_mem=False) sched = psy.invokes.invoke_list[0].schedule # Apply the transformation twice trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in sched.coded_kernels(): trans.apply(kernel) trans.apply(kernel) expected = '''subroutine invoke_0(cu_fld, p_fld, u_fld, u_fld, unew_fld, \ uold_fld) use compute_cu_mod, only : compute_cu_code use time_smooth_mod, only : time_smooth_code type(r2d_type), intent(out) :: cu_fld type(r2d_type), intent(in) :: p_fld type(r2d_type), intent(in) :: u_fld type(r2d_type), intent(in) :: unew_fld type(r2d_type), intent(inout) :: uold_fld integer :: j integer :: xstart integer :: xstop integer :: ystart integer :: ystop integer :: xstart_1 integer :: xstop_1 integer :: ystart_1 integer :: ystop_1 integer :: i integer :: i_1 xstart = cu_fld%internal%xstart xstop = cu_fld%internal%xstop ystart = cu_fld%internal%ystart ystop = cu_fld%internal%ystop do j = 1, SIZE(cu_fld%data, 2), 1 do i = 1, SIZE(cu_fld%data, 1), 1 call compute_cu_code(i, j, cu_fld%data, p_fld%data, u_fld%data, xstart, \ xstop, ystart, ystop) enddo enddo xstart_1 = 1 xstop_1 = SIZE(uold_fld%data, 1) ystart_1 = 1 ystop_1 = SIZE(uold_fld%data, 2) do j = 1, SIZE(uold_fld%data, 2), 1 do i_1 = 1, SIZE(uold_fld%data, 1), 1 call time_smooth_code(i_1, j, u_fld%data, unew_fld%data, uold_fld%data, \ xstart_1, xstop_1, ystart_1, ystop_1) enddo enddo end subroutine invoke_0 ''' writer = FortranWriter() sched.lower_to_language_level() assert writer(sched) == expected
def test_invoke_opencl_kernel_call(kernel_outputdir, monkeypatch, debug_mode): ''' Check that the Invoke OpenCL produce the expected kernel enqueue statement to launch OpenCL kernels. ''' api_config = Config.get().api_conf("gocean1.0") monkeypatch.setattr(api_config, "_debug_mode", debug_mode) psy, _ = get_invoke("single_invoke.f90", API, idx=0) sched = psy.invokes.invoke_list[0].schedule # Currently, moving the boundaries inside the kernel is a prerequisite # for the GOcean gen_ocl() code generation. trans = GOMoveIterationBoundariesInsideKernelTrans() for kernel in sched.coded_kernels(): trans.apply(kernel) otrans = OCLTrans() otrans.apply(sched) generated_code = str(psy.gen) # Set up globalsize and localsize values expected = '''\ globalsize = (/p_fld%grid%nx, p_fld%grid%ny/) localsize = (/64, 1/)''' if debug_mode: # Check that the globalsize first dimension is a multiple of # the localsize first dimension expected += ''' IF (mod(p_fld%grid%nx, 64) .ne. 0) THEN CALL check_status("Global size is not a multiple of local size \ (mandatory in OpenCL < 2.0).", -1) END IF''' assert expected in generated_code # Call the set_args subroutine with the boundaries corrected for the # OpenCL 0-indexing expected += ''' CALL compute_cu_code_set_args(kernel_compute_cu_code, \ cu_fld%device_ptr, p_fld%device_ptr, u_fld%device_ptr, \ xstart - 1, xstop - 1, \ ystart - 1, ystop - 1)''' expected += ''' ! Launch the kernel''' if debug_mode: # Check that there is no pending error in the queue before launching # the kernel expected += ''' ierr = clFinish(cmd_queues(1)) CALL check_status('Errors before compute_cu_code launch', ierr)''' expected += ''' ierr = clEnqueueNDRangeKernel(cmd_queues(1), kernel_compute_cu_code, \ 2, C_NULL_PTR, C_LOC(globalsize), C_LOC(localsize), 0, C_NULL_PTR, \ C_NULL_PTR) !''' if debug_mode: # Check that there are no errors during the kernel launch or during # the execution of the kernel. expected += ''' CALL check_status('compute_cu_code clEnqueueNDRangeKernel', ierr) ierr = clFinish(cmd_queues(1)) CALL check_status('Errors during compute_cu_code', ierr)''' assert expected in generated_code assert GOcean1p0OpenCLBuild(kernel_outputdir).code_compiles(psy)