Example #1
    def buildDeclarations(self, maxLoopA, maxLoopB, reduction_node_list, shared_node_list, ast):
        """ Builds the declaration section 
             @param numThreads number of threads
             @return Declarations subtree
        """ 
        # Position in the template for dimA declaration, just in case we change it
        DIMA_POS = 0
        DIMB_POS = 1
        MEMSIZE_POS = 4
        # TODO : Move this array creation to a template filter (something like |type)
        reduction_vars = get_template_array(reduction_node_list, ast)
        def check_array(elem):
            return isinstance(elem.type, c_ast.ArrayDecl) or isinstance(elem.type, c_ast.Struct)
        shared_vars = get_template_array(shared_node_list, ast, func = check_array) 

        template_code = """
         /* Kernel configuration */
         void kernel_func() {
                  int dimA = 1;
                  int dimB = 1;

                  int numThreadsPerBlock = CUDA_NUM_THREADS/2;
                  int numBlocks = dimA / numThreadsPerBlock + (dimA % numThreadsPerBlock?1:0);

                  int numThreadsPerBlockB = CUDA_NUM_THREADS/2;
                  int numBlocksB = dimB / numThreadsPerBlockB + (dimB % numThreadsPerBlockB?1:0);


                  int numElems = numBlocks * numThreadsPerBlock * numBlocksB * numThreadsPerBlockB;
                  int memSize = numElems * sizeof(double);


                  /* Variable declaration */
                  % for var in reduction_names:
                        ${var.type} * reduction_cu_${var.name};
                  % endfor

                  % for var in shared_vars:
                        ${var.type} * ${var.name}_cu;
                  % endfor
                  /* Initialization */
                  % for var in reduction_names:
                    cudaMalloc((void **) (&reduction_cu_${var.name}), numElems * sizeof(${var.type}));
                    /* This may be incorrect when the reduction does not start at 0 or 1 */
                    cudaMemset(reduction_cu_${var.name}, (int) ${var.name}, numElems * sizeof(${var.type}));
                  % endfor

                  % for var in shared_vars:
                  cudaMalloc((void **) (&${var.name}_cu), numElems * sizeof(${var.type}));
                  cudaMemcpy(${var.name}_cu, ${var.name}, numElems * sizeof(${var.type}), cudaMemcpyHostToDevice); 
                  % endfor
            }

            """
        kernel_init = self.parse_snippet(template_code,
                                         {'reduction_names' : reduction_vars, 'shared_vars' : shared_vars},
                                         name = 'Initialization of ' + self.kernel_name,
                                         show = False).ext[-1].body
        kernel_init.decls[DIMA_POS].init = maxLoopA
        kernel_init.decls[DIMB_POS].init = maxLoopB
        return kernel_init
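The snippets above are Mako templates: lines starting with % are control flow and ${...} are substitutions, which parse_snippet presumably renders before handing the result to the C parser. A minimal, self-contained sketch of that rendering step, assuming Mako; TVar is a hypothetical stand-in for the objects get_template_array returns, not llcomp's class:

    from mako.template import Template

    class TVar(object):
        """Hypothetical stand-in for a get_template_array entry."""
        def __init__(self, name, ctype):
            self.name = name
            self.type = ctype

    snippet = """
    % for var in reduction_names:
        ${var.type} * reduction_cu_${var.name};
        cudaMalloc((void **) (&reduction_cu_${var.name}), numElems * sizeof(${var.type}));
    % endfor
    """
    print(Template(snippet).render(reduction_names=[TVar('sum', 'double')]))

The render call emits plain C text (double * reduction_cu_sum; plus the matching cudaMalloc), which the builders then parse back into a pycparser subtree.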
Example #2
    def buildParallelDeclarations(self, shared_node_list, ast):
        """ Builds the declaration section of a Parallel Region
                This code handles memory transfers between host and cuda
              
             :param shared_node_list: List of shared variable declarations
             :param ast: Original ast (for type search)

             :return: Parallel Declarations subtree
        """ 
        # Keep only the shared vars that are arrays or structs; scalars are
        # passed by value and need no host/device transfer
        tmp = []
        for elem in shared_node_list:
            if isinstance(elem.type, c_ast.ArrayDecl) or isinstance(elem.type, c_ast.Struct):
                tmp.append(elem)

        shared_vars = get_template_array(tmp, ast) 

        # Type string | var name | pointer to type | pointer to var | declaration string
        template_code = """
            int main() {
                  % for var in shared_vars:
                        ${var.type} * ${var.name}_cu;
                  % endfor
                  % for var in shared_vars:
                  cudaMalloc((void **) (&${var.name}_cu), ${var.numelems} * sizeof(${var.type}));
                  cudaMemcpy(${var.name}_cu, ${var.name}, ${var.numelems} * sizeof(${var.type}), cudaMemcpyHostToDevice); 
                  % endfor
            }

            """
        print "New kernel build with name : " + self.kernel_name
        parallel_init = self.parse_snippet(template_code, {'shared_vars' : shared_vars},
                                           name = 'Initialization of Parallel Region ' + self.kernel_name,
                                           show = False).ext[-1].body
#~     from Tools.Debug import DotDebugTool
#~     DotDebugTool().apply(kernel_init)
        return parallel_init
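Both builders end by grabbing .ext[-1].body: parse_snippet evidently returns a pycparser FileAST, whose last top-level definition is the wrapper function and whose body is the Compound that gets spliced into the output. A small sketch with stock pycparser, using inlined C text in place of a rendered template (the cuda* calls parse as ordinary function calls):

    from pycparser import c_parser

    rendered = """
    int main() {
        double * x_cu;
        cudaMalloc((void **) (&x_cu), 100 * sizeof(double));
        cudaMemcpy(x_cu, x, 100 * sizeof(double), cudaMemcpyHostToDevice);
    }
    """
    ast = c_parser.CParser().parse(rendered)
    body = ast.ext[-1].body   # the Compound subtree the builders return
    body.show()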
Example #3
    def buildKernelLaunch(self, reduction_vars, shared_vars, ast):
        """ Builds the kernel launch statement: grid/block dimensions plus the
             <<< >>> call with reduction and shared vars as arguments """
        # TODO: Move this to some kind of template function
        def decls_to_param(elem):
            if isinstance(elem.type, c_ast.ArrayDecl):
                return elem.name + "_cu"
            return elem.name

        shared_vars = get_template_array(shared_vars, ast, name_func = decls_to_param) 

        template_code = """
        #include "llcomp_cuda.h" 
        
         int fake() {
                  dim3 dimGrid (numBlocks, numBlocksB);
                  dim3 dimBlock (numThreadsPerBlock, numThreadsPerBlockB);

             ${kernelName} <<< dimGrid, dimBlock >>> (${', '.join("reduction_cu_" + var.name for var in reduction_vars)}
                  %if len(reduction_vars) > 0 and len(shared_vars) > 0:
                        ,
                  %endif 
                  ${', '.join( var.name for var in shared_vars)});
         }
        """
        # The last FuncDef in the snippet is the launch wrapper; take its body
        snippet = self.parse_snippet(template_code,
                                     {'reduction_vars' : reduction_vars, 'shared_vars' : shared_vars, 'kernelName' : self.kernel_name},
                                     name = 'KernelLaunch', show = False)
        tree = [elem for elem in snippet.ext if type(elem) == c_ast.FuncDef][-1].body
        return tree
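The ${', '.join(...)} constructs work because Mako evaluates full Python expressions inside ${...}; the % if between the two groups emits the separating comma only when both are non-empty. A tiny rendering of just the reduction argument list (names are made up):

    from mako.template import Template

    tpl = '${", ".join("reduction_cu_" + name for name in reduction_vars)}'
    print(Template(tpl).render(reduction_vars=['sum', 'maxval']))
    # prints: reduction_cu_sum, reduction_cu_maxval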
Example #4
File: Common.py  Project: zhiichen/llcomp
    def buildHostReduction(self, reduction_vars, ast):
        """ Instanciate the reduction pattern 
 
           :param reduction_vars: Vars in the reduction clause
           :return: Compound with the reduction code
        """
        if len(reduction_vars) == 0:
            return c_ast.Compound(stmts = [], decls = [])

        template_code = """
        int fake() {
        #define LLC_REDUCTION_FUNC(dest, fuente) dest = dest + fuente 
        % for var in reduction_vars:
            ${var} = kernelReduction_${var.type}(reduction_cu_${var.name}, numElems, ${var.name});
        % endfor

        /* By default, omp for has a wait at the end */
        % if not nowait:
            cudaThreadSynchronize();
        % endif

        % for var in reduction_vars:
        cudaFree(reduction_cu_${var.name});
        % endfor
        }
        """
        return self.parse_snippet(template_code,
                                  {'reduction_vars' : get_template_array(reduction_vars, ast), 'nowait' : False},
                                  name = 'HostReduction').ext[0].body
Example #5
File: Common.py  Project: zhiichen/llcomp
    def buildRetrieve(self, reduction_vars, modified_shared_vars, ast = None, shared_vars = None):
        """ Builds the code that copies results back from the device to the host """
        memcpy_lines = []
        reduction_vars = get_template_array(reduction_vars, ast)
        shared_vars = get_template_array(modified_shared_vars, ast)

        # Template source
        template_code = """
        int fake() {
        % for var in reduction_vars:
          cudaMemcpy(reduction_loc_${var.name}, reduction_cu_${var.name}, memSize, cudaMemcpyDeviceToHost);
        % endfor

        % for var in shared_vars:
          cudaMemcpy(${var.name}, ${var.name}_cu, sizeof(${var.type}) * ${var.numelems}, cudaMemcpyDeviceToHost);
        % endfor

        checkCUDAError("memcpy");

        % for var in shared_vars:
        /*  cudaFree(${var.name}_cu);*/
        % endfor
        }
        """ 
        return self.parse_snippet(template_code,
                                  {'reduction_vars' : reduction_vars, 'shared_vars' : shared_vars},
                                  name = 'Retrieve', show = False).ext[0].body
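The var.numelems used in the sizeof expressions presumably comes from the dimension recorded in the shared variable's declaration. With stock pycparser that dimension hangs off the Decl node as type.dim; a sketch under that assumption (v is a made-up declaration):

    from pycparser import c_parser

    decl = c_parser.CParser().parse("double v[100];").ext[0]
    # For an array, decl.type is an ArrayDecl whose dim holds the size expression
    print(decl.name, decl.type.dim.value)   # prints: v 100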
Example #6
    def buildKernel(self, shared_list, private_list, reduction_list, loop, ast):
        """ Build CUDA Kernel code """

        reduction_vars = get_template_array(reduction_list, ast)

        # Only a for nested directly inside another for is handled:
        # collect the induction variables of both loops
        loop_list = [loop.init.lvalue.name, loop.stmt.stmts[0].init.lvalue.name]
        private_vars = get_template_array([var for var in private_list if not var.name in loop_list], ast)

        loop_vars = get_template_array([var for var in private_list if var.name in loop_list], ast)

        # Retrieve list of shared vars and build the array to template parsing
        # TODO: Move this to some kind of template function
        def decls_to_param(elem):
            if isinstance(elem.type, c_ast.ArrayDecl):
                return "*" + elem.name + "_cu"
            return elem.name

        shared_vars = get_template_array(shared_list, ast, name_func = decls_to_param) 

        clause_vars = []
        clause_vars.extend(shared_vars)
        clause_vars.extend(private_vars)
        typedef_list = get_typedefs_to_template(clause_vars, ast)


        template_code = """

            #include "llcomp_cuda.h" 
            %for line in typedefs:
                ${line}
            %endfor

             __global__ void ${kernelName} ( 
                  ${', '.join(str(var.type) + " * reduction_cu_" + str(var.name) for var in reduction_vars)}
                  %if len(reduction_vars) > 0 and len(shared_vars) > 0:
                        ,
                  %endif 
                  ${', '.join(str(var.type) + " " + str(var.name) for var in shared_vars)}
            )
             {
             int idx = blockIdx.x * blockDim.x + threadIdx.x;
             int idy = blockIdx.y * blockDim.y + threadIdx.y;

            ${loop_vars[0].declaration} ${loop_vars[0]} = idx;
            ${loop_vars[1].declaration} ${loop_vars[1]} = idy;

             %for var in private_vars:
                ${var.declaration} ${var};
             %endfor
             ;
             }
             """
        tree = self.parse_snippet(template_code,
                                  {'kernelName' : self.kernel_name, 'reduction_vars' : reduction_vars,
                                   'shared_vars' : shared_vars, 'typedefs' : typedef_list,
                                   'private_vars' : private_vars, 'loop_vars' : loop_vars},
                                  name = 'KernelBuild', show = False)

        # OpenMP shared vars are parameters of the kernel function
        if shared_list:
            for elem in shared_list:
                # Replace the name of the declaration in the kernel code.
                if isinstance(elem.type, c_ast.ArrayDecl) or isinstance(elem.type, c_ast.Struct):
                    mut = IDNameMutator(old = c_ast.ID(elem.name), new = c_ast.ID(elem.name + '_cu'))
                    mut.apply_all(loop.stmt)

        DeclsToParamsMutator().apply(tree.ext[-1].function.decl.type.args)
        # OpenMP Private vars need to be declared inside kernel
        #     - we build a tmp Compound to group all declarations, and insert them once
        tmp = c_ast.Compound(decls = [], stmts=[])
        #     - Insert tool removes the parent node of the inserted subtree
        InsertTool(subtree = tmp, position = "end").apply(tree.ext[-1].function.body, 'decls')

        # Identify function calls inside the kernel and turn their definitions into __device__ functions
        try:
            for func_call in FuncCallFilter().iterate(loop.stmt):
                print " Writing " + func_call.name.name + " to device "
                try:
                    fcm = FuncToDeviceMutator(func_call = func_call).apply(ast)
                except IgnoreMutationException as ime:
                    # The function is already implemented on the device,
                    # so there is no need to convert it
                    print "CudaMutator:: Warning :: " + str(ime)
        except NodeNotFound:
            # There are no function calls in loop.stmt
            pass
        except FilterError as fe:
            print " Filter error "
            raise CudaMutatorError(fe.get_description())
        # Redirect reduction vars to their device buffers, indexed by thread
        # TODO: This is incorrect, we should write a subtree instead of a bare string...
        for elem in reduction_list:
            IDNameMutator(old = c_ast.ID(name = elem.name, parent = elem.parent),
                          new = c_ast.ID(name = 'reduction_cu_' + str(elem.name) + '[idx]',
                                         parent = elem.parent)).apply_all(loop.stmt)
        # Insert the code inside kernel
        # We need to check if the idx is inside for limits (in case we have more threads than iterations)
        # TODO change this to generic form
        merge_condition_node = c_ast.BinaryOp(op = '&&', left = loop.cond,
                                              right = loop.stmt.stmts[0].cond, parent = None)
        # TODO change this to generic form
        check_boundary_node = c_ast.Compound(decls = None,
                                             stmts = [c_ast.If(cond = merge_condition_node,
                                                               iftrue = loop.stmt.stmts[0].stmt, iffalse = None)],
                                             parent = tree.ext[-1].function.body)
        # Preserve parent node
        merge_condition_node.parent = check_boundary_node
        assert check_boundary_node.stmts[0].cond.parent == check_boundary_node

        InsertTool(subtree = check_boundary_node, position = "begin").apply(tree.ext[-1].function.body, 'stmts')
        return c_ast.FileAST(ext = [tree.ext[-1]])
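IDNameMutator and the other mutators are llcomp classes, but the renaming they perform can be sketched with pycparser's stock NodeVisitor. The string splice flagged in the TODO above (writing reduction_cu_x[idx] into an ID name) looks like this in isolation; RenameID is a hypothetical simplification, not the project's implementation:

    from pycparser import c_ast, c_generator, c_parser

    class RenameID(c_ast.NodeVisitor):
        """Rename every matching identifier in a subtree."""
        def __init__(self, old, new):
            self.old = old
            self.new = new
        def visit_ID(self, node):
            if node.name == self.old:
                node.name = self.new

    ast = c_parser.CParser().parse("int f(int sum) { return sum + 1; }")
    RenameID('sum', 'reduction_cu_sum[idx]').visit(ast)
    print(c_generator.CGenerator().visit(ast))
    # body now reads: return reduction_cu_sum[idx] + 1;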