Пример #1
0
def op2_gen_mpi_vec(master, date, consts, kernels):

    global dims, idxs, typs, indtyps, inddims
    global FORTRAN, CPP, g_m, file_text, depth

    OP_ID = 1
    OP_GBL = 2
    OP_MAP = 3

    OP_READ = 1
    OP_WRITE = 2
    OP_RW = 3
    OP_INC = 4
    OP_MAX = 5
    OP_MIN = 6

    accsstring = ['OP_READ', 'OP_WRITE', 'OP_RW', 'OP_INC', 'OP_MAX', 'OP_MIN']

    any_soa = 0
    for nk in range(0, len(kernels)):
        any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

    for nk in range(0, len(kernels)):

        name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
                ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
                unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])
        #
        # set three logicals
        #
        j = -1
        for i in range(0, nargs):
            if maps[i] == OP_MAP and accs[i] == OP_INC:
                j = i
        ind_inc = j >= 0

        j = -1
        for i in range(0, nargs):
            if maps[i] == OP_GBL and accs[i] <> OP_READ:
                j = i
        reduct = j >= 0

        j = -1
        for i in range(0, nargs):
            if maps[i] == OP_MAP:
                j = i
        indirect_kernel = j >= 0

        ####################################################################################
        #  generate the user kernel function - creating versions for vectorisation as needed
        ####################################################################################

        FORTRAN = 0
        CPP = 1
        g_m = 0
        file_text = ''
        depth = 0

        #
        # First original version
        #
        comm('user function')
        file_name = decl_filepath

        f = open(file_name, 'r')
        kernel_text = f.read()
        file_text += kernel_text
        f.close()

        #
        # Modified vectorisable version if its an indirect kernel
        # - direct kernels can be vectorised without modification
        #
        if indirect_kernel:
            code('#ifdef VECTORIZE')
            comm('user function -- modified for vectorisation')
            f = open(file_name, 'r')
            kernel_text = f.read()
            f.close()

            kernel_text = op2_gen_common.comment_remover(kernel_text)
            kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text)

            p = re.compile('void\\s+\\b' + name + '\\b')
            i = p.search(kernel_text).start()

            if (i < 0):
                print "\n********"
                print "Error: cannot locate user kernel function name: " + name + " - Aborting code generation"
                exit(2)
            i2 = i

            #i = kernel_text[0:i].rfind('\n') #reverse find
            j = kernel_text[i:].find('{')
            k = op2_gen_common.para_parse(kernel_text, i + j, '{', '}')
            signature_text = kernel_text[i:i + j]
            l = signature_text[0:].find('(')
            head_text = signature_text[0:l]  #save function name
            m = op2_gen_common.para_parse(signature_text, 0, '(', ')')
            signature_text = signature_text[l + 1:m]
            body_text = kernel_text[i + j + 1:k]

            ## Replace occurrences of '#include "<FILE>"' within loop with the contents of <FILE>:
            body_text = op2_gen_common.replace_local_includes_with_file_contents(
                body_text, os.path.dirname(master))

            # check for number of arguments
            nargs_actual = len(signature_text.split(','))
            if nargs_actual != nargs:
                print(
                    'Error parsing user kernel({0}): must have {1} arguments (instead it has {2})'
                    .format(name, nargs, nargs_actual))
                return

            new_signature_text = ''
            for i in range(0, nargs):
                var = signature_text.split(',')[i].strip()

                if maps[i] <> OP_GBL and maps[i] <> OP_ID:
                    #remove * and add [*][SIMD_VEC]
                    var = var.replace('*', '')
                    #locate var in body and replace by adding [idx]
                    length = len(re.compile('\\s+\\b').split(var))
                    var2 = re.compile('\\s+\\b').split(var)[length - 1].strip()

                    #print var2

                    body_text = re.sub('\*\\b' + var2 + '\\b\\s*(?!\[)',
                                       var2 + '[0]', body_text)
                    body_text = re.sub(
                        r'(' + var2 + '\[[\w\(\)\+\-\*\s\\\\]*\]' + ')',
                        r'\1' + '[idx]', body_text)

                    var = var + '[*][SIMD_VEC]'
                    #var = var + '[restrict][SIMD_VEC]'
                new_signature_text += var + ', '

            #add ( , idx and )
            signature_text = "inline " + head_text + '( ' + new_signature_text + 'int idx ) {'
            #finally update name
            signature_text = signature_text.replace(name, name + '_vec')

            #print head_text
            #print signature_text
            #print  body_text

            file_text += signature_text + body_text + '}\n'
            code('#endif')

##########################################################################
# then C++ stub function
##########################################################################

        code('')
        comm(' host stub function')
        code('void op_par_loop_' + name + '(char const *name, op_set set,')
        depth += 2

        for m in unique_args:
            g_m = m - 1
            if m == unique_args[len(unique_args) - 1]:
                code('op_arg <ARG>){')
                code('')
            else:
                code('op_arg <ARG>,')

        code('int nargs = ' + str(nargs) + ';')
        code('op_arg args[' + str(nargs) + '];')
        code('')

        for g_m in range(0, nargs):
            u = [
                i for i in range(0, len(unique_args))
                if unique_args[i] - 1 == g_m
            ]
            if len(u) > 0 and vectorised[g_m] > 0:
                code('<ARG>.idx = 0;')
                code('args[' + str(g_m) + '] = <ARG>;')

                v = [
                    int(vectorised[i] == vectorised[g_m])
                    for i in range(0, len(vectorised))
                ]
                first = [i for i in range(0, len(v)) if v[i] == 1]
                first = first[0]
                if (optflags[g_m] == 1):
                    argtyp = 'op_opt_arg_dat(arg' + str(first) + '.opt, '
                else:
                    argtyp = 'op_arg_dat('

                FOR('v', '1', str(sum(v)))
                code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
                str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');')
                ENDFOR()
                code('')
            elif vectorised[g_m] > 0:
                pass
            else:
                code('args[' + str(g_m) + '] = <ARG>;')

#
# create aligned pointers
#
        comm('create aligned pointers for dats')
        for g_m in range(0, nargs):
            if maps[g_m] <> OP_GBL:
                if (accs[g_m] == OP_INC or accs[g_m] == OP_RW
                        or accs[g_m] == OP_WRITE):
                    code('ALIGNED_<TYP>       <TYP> * __restrict__ ptr'+\
                    str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')
                    #code('<TYP>* __restrict__ __attribute__((align_value (<TYP>_ALIGN)))  ptr'+\
                    #str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')
                    code('__assume_aligned(ptr' + str(g_m) + ',<TYP>_ALIGN);')

                else:
                    code('ALIGNED_<TYP> const <TYP> * __restrict__ ptr'+\
                    str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')
                    code('__assume_aligned(ptr' + str(g_m) + ',<TYP>_ALIGN);')
                    #code('const <TYP>* __restrict__ __attribute__((align_value (<TYP>_ALIGN)))  ptr'+\
                    #str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')

#
# start timing
#
        code('')
        comm(' initialise timers')
        code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
        code('op_timing_realloc(' + str(nk) + ');')
        code('op_timers_core(&cpu_t1, &wall_t1);')
        code('')

        #
        #   indirect bits
        #
        if ninds > 0:
            IF('OP_diags>2')
            code('printf(" kernel routine with indirection: ' + name +
                 '\\n");')
            ENDIF()

#
# direct bit
#
        else:
            code('')
            IF('OP_diags>2')
            code('printf(" kernel routine w/o indirection:  ' + name + '");')
            ENDIF()

        code('')
        code('int exec_size = op_mpi_halo_exchanges(set, nargs, args);')

        code('')
        IF('exec_size >0')
        code('')

        #
        # kernel call for indirect version
        #
        if ninds > 0:
            code('#ifdef VECTORIZE')

            code('#pragma novector')
            FOR2('n', '0', '(exec_size/SIMD_VEC)*SIMD_VEC', 'SIMD_VEC')
            #initialize globals
            for g_m in range(0, nargs):
                if maps[g_m] == OP_GBL:
                    code('<TYP> dat{0}[SIMD_VEC];'.format(g_m))
                    FOR('i', '0', 'SIMD_VEC')
                    if accs[g_m] == OP_INC:
                        code('dat{0}[i] = 0.0;'.format(g_m))
                    elif accs[g_m] == OP_MAX:
                        code('dat{0}[i] = -INFINITY;'.format(g_m))
                    elif accs[g_m] == OP_MIN:
                        code('dat{0}[i] = INFINITY;'.format(g_m))
                    elif accs[g_m] == OP_READ:
                        code('dat{0}[i] = *((<TYP>*)arg{0}.data);'.format(g_m))
                    ENDFOR()

            IF('n+SIMD_VEC >= set->core_size')
            code('op_mpi_wait_all(nargs, args);')
            ENDIF()
            for g_m in range(0, nargs):
                if maps[g_m] == OP_MAP and (accs[g_m] == OP_READ \
                  or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE \
                  or accs[g_m] == OP_INC):
                    code('ALIGNED_<TYP> <TYP> dat' + str(g_m) +
                         '[<DIM>][SIMD_VEC];')

            #setup gathers
            code('#pragma omp simd simdlen(SIMD_VEC)')
            FOR('i', '0', 'SIMD_VEC')
            if nmaps > 0:
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP:
                        if (accs[g_m] == OP_READ or accs[g_m] == OP_RW
                                or accs[g_m]
                                == OP_WRITE):  #and (not mapinds[g_m] in k):
                            code('int idx' + str(g_m) +
                                 '_<DIM> = <DIM> * arg' +
                                 str(invmapinds[inds[g_m] - 1]) +
                                 '.map_data[(n+i) * arg' +
                                 str(invmapinds[inds[g_m] - 1]) +
                                 '.map->dim + ' + str(idxs[g_m]) + '];')
            code('')
            for g_m in range(0, nargs):
                if maps[g_m] == OP_MAP:
                    if (accs[g_m] == OP_READ or accs[g_m]
                            == OP_RW):  #and (not mapinds[g_m] in k):
                        for d in range(0, int(dims[g_m])):
                            code('dat' + str(g_m) + '[' + str(d) +
                                 '][i] = (ptr' + str(g_m) + ')[idx' +
                                 str(g_m) + '_<DIM> + ' + str(d) + '];')
                        code('')
                    elif (accs[g_m] == OP_INC):
                        for d in range(0, int(dims[g_m])):
                            code('dat' + str(g_m) + '[' + str(d) +
                                 '][i] = 0.0;')
                        code('')
                else:  #globals
                    if (accs[g_m] == OP_INC):
                        # for d in range(0,int(dims[g_m])):
                        #   code('dat'+str(g_m)+'[i] = 0.0;')
                        # code('')
                        pass

            ENDFOR()
            #kernel call
            code('#pragma omp simd simdlen(SIMD_VEC)')
            FOR('i', '0', 'SIMD_VEC')
            line = name + '_vec('
            indent = '\n' + ' ' * (depth + 2)
            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID:
                    line = line + indent + '&(ptr' + str(g_m) + ')[' + str(
                        dims[g_m]) + ' * (n+i)],'
                elif maps[g_m] == OP_GBL and accs[g_m] == OP_READ:
                    line = line + indent + '(' + typs[g_m] + '*)arg' + str(
                        g_m) + '.data,'
                elif maps[g_m] == OP_GBL and accs[g_m] == OP_INC:
                    line = line + indent + '&dat' + str(g_m) + '[i],'
                else:
                    line = line + indent + 'dat' + str(g_m) + ','
            line = line + indent + 'i);'
            code(line)
            ENDFOR()
            #do the scatters
            FOR('i', '0', 'SIMD_VEC')
            if nmaps > 0:
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP:
                        if (accs[g_m] == OP_INC or accs[g_m] == OP_RW
                                or accs[g_m]
                                == OP_WRITE):  #and (not mapinds[g_m] in k):
                            code('int idx' + str(g_m) +
                                 '_<DIM> = <DIM> * arg' +
                                 str(invmapinds[inds[g_m] - 1]) +
                                 '.map_data[(n+i) * arg' +
                                 str(invmapinds[inds[g_m] - 1]) +
                                 '.map->dim + ' + str(idxs[g_m]) + '];')
            code('')
            for g_m in range(0, nargs):
                if maps[g_m] == OP_MAP:
                    if (accs[g_m] == OP_INC):
                        for d in range(0, int(dims[g_m])):
                            code('(ptr' + str(g_m) + ')[idx' + str(g_m) +
                                 '_<DIM> + ' + str(d) + '] += dat' + str(g_m) +
                                 '[' + str(d) + '][i];')
                        code('')
                    if (accs[g_m] == OP_WRITE or accs[g_m] == OP_RW):
                        for d in range(0, int(dims[g_m])):
                            code('(ptr' + str(g_m) + ')[idx' + str(g_m) +
                                 '_<DIM> + ' + str(d) + '] = dat' + str(g_m) +
                                 '[' + str(d) + '][i];')
                        code('')
            ENDFOR()

            #do reductions
            for g_m in range(0, nargs):
                if maps[g_m] == OP_GBL:
                    FOR('i', '0', 'SIMD_VEC')
                    if accs[g_m] == OP_INC:
                        code('*(<TYP>*)arg' + str(g_m) + '.data += dat' +
                             str(g_m) + '[i];')
                    elif accs[g_m] == OP_MAX:
                        code('*(<TYP>*)arg' + str(g_m) +
                             '.data = MAX(*(<TYP>*)arg' + str(g_m) +
                             '.data,dat' + str(g_m) + '[i]);')
                    elif accs[g_m] == OP_MIN:
                        code('*(<TYP>*)arg' + str(g_m) +
                             '.data = MIN(*(<TYP>*)arg' + str(g_m) +
                             '.data,dat' + str(g_m) + '[i]);')
                    ENDFOR()

            ENDFOR()
            code('')
            comm('remainder')
            FOR('n', '(exec_size/SIMD_VEC)*SIMD_VEC', 'exec_size')
            depth = depth - 2
            code('#else')
            FOR('n', '0', 'exec_size')
            depth = depth - 2
            code('#endif')
            depth = depth + 2
            IF('n==set->core_size')
            code('op_mpi_wait_all(nargs, args);')
            ENDIF()
            if nmaps > 0:
                k = []
                #print name
                #print maps
                #print mapinds
                for g_m in range(0, nargs):
                    #print g_m
                    if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
                        k = k + [mapinds[g_m]]
                        code('int map' + str(mapinds[g_m]) + 'idx = arg' +
                             str(invmapinds[inds[g_m] - 1]) +
                             '.map_data[n * arg' +
                             str(invmapinds[inds[g_m] - 1]) + '.map->dim + ' +
                             str(idxs[g_m]) + '];')
            code('')
            line = name + '('
            indent = '\n' + ' ' * (depth + 2)
            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID:
                    line = line + indent + '&(ptr' + str(g_m) + ')[' + str(
                        dims[g_m]) + ' * n]'
                if maps[g_m] == OP_MAP:
                    line = line + indent + '&(ptr' + str(g_m) + ')[' + str(
                        dims[g_m]) + ' * map' + str(mapinds[g_m]) + 'idx]'
                if maps[g_m] == OP_GBL:
                    line = line + indent + '(' + typs[g_m] + '*)arg' + str(
                        g_m) + '.data'
                if g_m < nargs - 1:
                    line = line + ','
                else:
                    line = line + ');'
            code(line)
            ENDFOR()

#
# kernel call for direct version
#
        else:
            code('#ifdef VECTORIZE')

            code('#pragma novector')
            FOR2('n', '0', '(exec_size/SIMD_VEC)*SIMD_VEC', 'SIMD_VEC')

            #initialize globals
            for g_m in range(0, nargs):
                if maps[g_m] == OP_GBL:
                    code('<TYP> dat{0}[SIMD_VEC];'.format(g_m))
                    FOR('i', '0', 'SIMD_VEC')
                    if accs[g_m] == OP_INC:
                        code('dat{0}[i] = 0.0;'.format(g_m))
                    elif accs[g_m] == OP_MAX:
                        code('dat{0}[i] = -INFINITY;'.format(g_m))
                    elif accs[g_m] == OP_MIN:
                        code('dat{0}[i] = INFINITY;'.format(g_m))
                    elif accs[g_m] == OP_READ:
                        code('dat{0}[i] = *((<TYP>*)arg{0}.data);'.format(g_m))
                    ENDFOR()

            code('#pragma omp simd simdlen(SIMD_VEC)')
            FOR('i', '0', 'SIMD_VEC')
            line = name + '('
            indent = '\n' + ' ' * (depth + 2)
            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID:
                    line = line + indent + '&(ptr' + str(g_m) + ')[' + str(
                        dims[g_m]) + ' * (n+i)]'
                if maps[g_m] == OP_MAP:
                    line = line + indent + '&(ptr' + str(g_m) + ')[' + str(
                        dims[g_m]) + ' * map' + str(mapinds[g_m]) + 'idx]'
                if maps[g_m] == OP_GBL:
                    line = line + indent + '&dat' + str(g_m) + '[i]'
                if g_m < nargs - 1:
                    line = line + ','
                else:
                    line = line + ');'
            code(line)
            ENDFOR()
            #do reductions
            for g_m in range(0, nargs):
                if maps[g_m] == OP_GBL:
                    FOR('i', '0', 'SIMD_VEC')
                    if accs[g_m] == OP_INC:
                        code('*(<TYP>*)arg' + str(g_m) + '.data += dat' +
                             str(g_m) + '[i];')
                    elif accs[g_m] == OP_MAX:
                        code('*(<TYP>*)arg' + str(g_m) +
                             '.data = MAX(*(<TYP>*)arg' + str(g_m) +
                             '.data,dat' + str(g_m) + '[i]);')
                    elif accs[g_m] == OP_MIN:
                        code('*(<TYP>*)arg' + str(g_m) +
                             '.data = MIN(*(<TYP>*)arg' + str(g_m) +
                             '.data,dat' + str(g_m) + '[i]);')
                    ENDFOR()
            ENDFOR()

            comm('remainder')
            FOR('n', '(exec_size/SIMD_VEC)*SIMD_VEC', 'exec_size')
            depth = depth - 2
            code('#else')
            FOR('n', '0', 'exec_size')
            depth = depth - 2
            code('#endif')
            depth = depth + 2
            line = name + '('
            indent = '\n' + ' ' * (depth + 2)
            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID:
                    line = line + indent + '&(ptr' + str(g_m) + ')[' + str(
                        dims[g_m]) + '*n]'
                if maps[g_m] == OP_GBL:
                    line = line + indent + '(' + typs[g_m] + '*)arg' + str(
                        g_m) + '.data'
                if g_m < nargs - 1:
                    line = line + ','
                else:
                    line = line + ');'
            code(line)
            ENDFOR()
        ENDIF()
        code('')

        #zero set size issues
        if ninds > 0:
            IF('exec_size == 0 || exec_size == set->core_size')
            code('op_mpi_wait_all(nargs, args);')
            ENDIF()

#
# combine reduction data from multiple OpenMP threads
#
        comm(' combine reduction data')
        for g_m in range(0, nargs):
            if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ:
                code('op_mpi_reduce(&<ARG>,(' + typs[g_m] + '*)<ARG>.data);')

        code('op_mpi_set_dirtybit(nargs, args);')
        code('')

        #
        # update kernel record
        #

        comm(' update kernel record')
        code('op_timers_core(&cpu_t2, &wall_t2);')
        code('OP_kernels[' + str(nk) + '].name      = name;')
        code('OP_kernels[' + str(nk) + '].count    += 1;')
        code('OP_kernels[' + str(nk) + '].time     += wall_t2 - wall_t1;')

        if ninds == 0:
            line = 'OP_kernels[' + str(nk) + '].transfer += (float)set->size *'

            for g_m in range(0, nargs):
                if maps[g_m] <> OP_GBL:
                    if accs[g_m] == OP_READ:
                        code(line + ' <ARG>.size;')
                    else:
                        code(line + ' <ARG>.size * 2.0f;')
        else:
            names = []
            for g_m in range(0, ninds):
                mult = ''
                if indaccs[g_m] <> OP_WRITE and indaccs[g_m] <> OP_READ:
                    mult = ' * 2.0f'
                if not var[invinds[g_m]] in names:
                    code('OP_kernels[' + str(nk) +
                         '].transfer += (float)set->size * arg' +
                         str(invinds[g_m]) + '.size' + mult + ';')
                    names = names + [var[invinds[g_m]]]
            for g_m in range(0, nargs):
                mult = ''
                if accs[g_m] <> OP_WRITE and accs[g_m] <> OP_READ:
                    mult = ' * 2.0f'
                if not var[g_m] in names:
                    names = names + [var[invinds[g_m]]]
                    if maps[g_m] == OP_ID:
                        code('OP_kernels[' + str(nk) +
                             '].transfer += (float)set->size * arg' +
                             str(g_m) + '.size' + mult + ';')
                    elif maps[g_m] == OP_GBL:
                        code('OP_kernels[' + str(nk) +
                             '].transfer += (float)set->size * arg' +
                             str(g_m) + '.size' + mult + ';')
            if nmaps > 0:
                k = []
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
                        k = k + [mapnames[g_m]]
                        code('OP_kernels[' + str(nk) +
                             '].transfer += (float)set->size * arg' +
                             str(invinds[inds[g_m] - 1]) + '.map->dim * 4.0f;')

        depth -= 2
        code('}')

        ##########################################################################
        #  output individual kernel file
        ##########################################################################
        if not os.path.exists('vec'):
            os.makedirs('vec')
        fid = open('vec/' + name + '_veckernel.cpp', 'w')
        date = datetime.datetime.now()
        #fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n')
        fid.write('//\n// auto-generated by op2.py\n//\n\n')
        fid.write(file_text)
        fid.close()


# end of main kernel call loop

##########################################################################
#  output one master kernel file
##########################################################################

    file_text = ''

    code('#define double_ALIGN 128')
    code('#define float_ALIGN 64')
    code('#define int_ALIGN 64')
    code('#ifdef VECTORIZE')
    code('#define SIMD_VEC 4')
    code('#define ALIGNED_double __attribute__((aligned(double_ALIGN)))')
    code('#define ALIGNED_float __attribute__((aligned(float_ALIGN)))')
    code('#define ALIGNED_int __attribute__((aligned(int_ALIGN)))')
    code('#else')
    code('#define ALIGNED_double')
    code('#define ALIGNED_float')
    code('#define ALIGNED_int')
    code('#endif')
    code('')

    comm(' global constants       ')

    for nc in range(0, len(consts)):
        if not consts[nc]['user_declared']:
            if consts[nc]['dim'] == 1:
                code('extern ' + consts[nc]['type'][1:-1] + ' ' +
                     consts[nc]['name'] + ';')
            else:
                if consts[nc]['dim'] > 0:
                    num = str(consts[nc]['dim'])
                else:
                    num = 'MAX_CONST_SIZE'
                code('extern ' + consts[nc]['type'][1:-1] + ' ' +
                     consts[nc]['name'] + '[' + num + '];')
    code('')

    comm(' header                 ')

    if os.path.exists('./user_types.h'):
        code('#include "../user_types.h"')
    code('#include "op_lib_cpp.h"')
    code('')

    comm(' user kernel files')

    for nk in range(0, len(kernels)):
        code('#include "' + kernels[nk]['name'] + '_veckernel.cpp"')
    master = master.split('.')[0]
    fid = open('vec/' + master.split('.')[0] + '_veckernels.cpp', 'w')
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()
Пример #2
0
def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):

  global dims, idxs, typs, indtyps, inddims
  global FORTRAN, CPP, g_m, file_text, depth

  OP_ID   = 1;  OP_GBL   = 2;  OP_MAP = 3;

  OP_READ = 1;  OP_WRITE = 2;  OP_RW  = 3;
  OP_INC  = 4;  OP_MAX   = 5;  OP_MIN = 6;

  accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ]

  inc_stage=0
  op_color2=0
  op_color2_force=0
##########################################################################
#  create new kernel file
##########################################################################

  for nk in range (0,len(kernels)):

    name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
           ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
           unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk], inc_stage)

    any_soa = 0
    any_soa = any_soa or sum(soaflags)
#
# set logicals
#
    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_INC:
        j = i
    ind_inc = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_RW:
        j = i
    ind_rw = j >= 0

    if ind_rw or op_color2_force:
        op_color2 = 1
    else:
        op_color2 = 0

    #no staging with 2 level colouring
    if op_color2:
      inc_stage=0

    optidxs = [0]*nargs
    indopts = [-1]*nargs
    nopts = 0
    for i in range(0,nargs):
      if optflags[i] == 1 and maps[i] == OP_ID:
        optidxs[i] = nopts
        nopts = nopts+1
      elif optflags[i] == 1 and maps[i] == OP_MAP:
        if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination
          optidxs[i] = nopts
          indopts[inds[i]-1] = i
          nopts = nopts+1
        else:
          optidxs[i] = optidxs[invinds[inds[i]-1]]

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_GBL and accs[i] <> OP_READ and accs[i] <> OP_WRITE:
        j = i
    reduct = j >= 0

    if inc_stage:
      ninds_staged = 0
      inds_staged = [-1]*nargs
      for i in range(0,nargs):
        if maps[i]==OP_MAP and accs[i]==OP_INC:
          if inds_staged[invinds[inds[i]-1]] == -1:
            inds_staged[i] = ninds_staged
            ninds_staged = ninds_staged + 1
          else:
            inds_staged[i] = inds_staged[invinds[inds[i]-1]]
      invinds_staged = [-1]*ninds_staged
      inddims_staged = [-1]*ninds_staged
      indopts_staged = [-1]*ninds_staged
      for i in range(0,nargs):
        if inds_staged[i] >= 0 and invinds_staged[inds_staged[i]] == -1:
          invinds_staged[inds_staged[i]] = i
          inddims_staged[inds_staged[i]] = dims[i]
          if optflags[i] == 1:
            indopts_staged[inds_staged[i]] = i
      for i in range(0,nargs):
        inds_staged[i] = inds_staged[i] + 1

##########################################################################
#  start with CUDA kernel function
##########################################################################

    FORTRAN = 0;
    CPP     = 1;
    g_m = 0;
    file_text = ''
    depth = 0


    #strides for SoA
    if any_soa:
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            code('__constant__ int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;')
            code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;')
      dir_soa = -1
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1):
          code('__constant__ int direct_'+name+'_stride_OP2CONSTANT;')
          code('int direct_'+name+'_stride_OP2HOST=-1;')
          dir_soa = g_m
          break

    file_name = decl_filepath

    f = open(file_name, 'r')
    kernel_text = f.read()
    f.close()

    if CPP:
      includes = op2_gen_common.extract_includes(kernel_text)
      if len(includes) > 0:
        for include in includes:
          code(include)
        code("")

    comm('user function')
    
    kernel_text = op2_gen_common.comment_remover(kernel_text)
    kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text)

    p = re.compile('void\\s+\\b'+name+'\\b')
    i = p.search(kernel_text).start()

    if(i < 0):
      print "\n********"
      print "Error: cannot locate user kernel function name: "+name+" - Aborting code generation"
      exit(2)
    i2 = i

    #i = kernel_text[0:i].rfind('\n') #reverse find
    j = kernel_text[i:].find('{')
    k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}')
    signature_text = kernel_text[i:i+j]
    l = signature_text[0:].find('(')
    head_text = signature_text[0:l].strip() #save function name
    m = op2_gen_common.para_parse(signature_text, 0, '(', ')')
    signature_text = signature_text[l+1:m]
    body_text = kernel_text[i+j+1:k]

    ## Replace occurrences of '#include "<FILE>"' within loop with the contents of <FILE>:
    body_text = op2_gen_common.replace_local_includes_with_file_contents(body_text, os.path.dirname(master))

    # check for number of arguments
    if len(signature_text.split(',')) != nargs_novec:
        print 'Error parsing user kernel('+name+'): must have '+str(nargs)+' arguments'
        return

    for i in range(0,nargs_novec):
        var = signature_text.split(',')[i].strip()
        if kernels[nk]['soaflags'][i] and (op_color2 or  not (kernels[nk]['maps'][i] == OP_MAP and kernels[nk]['accs'][i] == OP_INC)):
          var = var.replace('*','')
          #locate var in body and replace by adding [idx]
          length = len(re.compile('\\s+\\b').split(var))
          var2 = re.compile('\\s+\\b').split(var)[length-1].strip()

          if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP:
            body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)
          else:
            body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text)
            body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \
                               op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)

    for nc in range(0,len(consts)): 
      varname = consts[nc]['name']
      body_text = re.sub('\\b'+varname+'\\b', varname+'_cuda',body_text)

    signature_text = '__device__ '+head_text + '_gpu( '+signature_text + ') {'
    file_text += signature_text + body_text + '}\n'

    comm('')
    comm(' CUDA kernel function')

    if FORTRAN:
      code('subroutine op_cuda_'+name+'(')
    elif CPP:
      code('__global__ void op_cuda_'+name+'(')

    depth = 2

    if nopts > 0:
      code('int optflags,')
      
    for g_m in range(0,ninds):
      if (indaccs[g_m]==OP_READ):
        code('const <INDTYP> *__restrict <INDARG>,')
      else:
        code('<INDTYP> *__restrict <INDARG>,')

    if nmaps > 0:
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
          k = k + [mapnames[g_m]]
          code('const int *__restrict opDat'+str(invinds[inds[g_m]-1])+'Map, ')



    for g_m in range(0,nargs):
      if maps[g_m] == OP_ID:
        if accs[g_m] == OP_READ:
          code('const <TYP> *__restrict <ARG>,')
        else:
          code('<TYP> *<ARG>,')
      elif maps[g_m] == OP_GBL:
        if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE:
          code('<TYP> *<ARG>,')
        elif accs[g_m] == OP_READ:
          code('const <TYP> *<ARG>,')

    if ind_inc and inc_stage==1:
      code('int   *ind_map,')
      code('short *arg_map,')
      code('int   *ind_arg_sizes,')
      code('int   *ind_arg_offs, ')

    if ninds>0:
      if not op_color2:
        code('int    block_offset, ')
        code('int   *blkmap,       ')
        code('int   *offset,       ')
        code('int   *nelems,       ')
        code('int   *ncolors,      ')
        code('int   *colors,       ')
        code('int   nblocks,       ')
      else:
        code('int start,           ')
        code('int end,             ')
        code('int *col_reord,      ')
      code('int   set_size) {    ')
    else:
      code('int   set_size ) {')
      code('')


    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m] <> OP_WRITE:
        code('<TYP> <ARG>_l[<DIM>];')
        if accs[g_m] == OP_INC:
          FOR('d','0','<DIM>')
          code('<ARG>_l[d]=ZERO_<TYP>;')
          ENDFOR()
        else:
          FOR('d','0','<DIM>')
          code('<ARG>_l[d]=<ARG>[d+blockIdx.x*<DIM>];')
          ENDFOR()
      elif maps[g_m]==OP_MAP and accs[g_m]==OP_INC and not op_color2:
        code('<TYP> <ARG>_l[<DIM>];')

    if not op_color2:
      for m in range (1,ninds+1):
        g_m = m -1
        v = [int(inds[i]==m) for i in range(len(inds))]
        v_i = [vectorised[i] for i in range(len(inds)) if inds[i] == m]
        if sum(v)>1 and sum(v_i)>0: #check this sum(v_i)
          if indaccs[m-1] == OP_INC:
            ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1
            code('<INDTYP> *arg'+str(invinds[m-1])+'_vec['+str(ind)+'] = {'); depth += 2;
            for n in range(0,nargs):
              if inds[n] == m:
                g_m = n
                code('<ARG>_l,')
            depth -= 2
            code('};')
#
# lengthy code for general case with indirection
#
    if ninds>0 and not op_color2:
      code('')
      if inc_stage==1:
        for g_m in range (0,ninds):
          if indaccs[g_m] == OP_INC:
            code('__shared__  int  *<INDARG>_map, <INDARG>_size;')
            code('__shared__  <INDTYP> *<INDARG>_s;')
        code('')
      if ind_inc:
        code('__shared__ int    nelems2, ncolor;')

      code('__shared__ int    nelem, offset_b;')
      code('')
      code('extern __shared__ char shared[];')
      code('')
      IF('blockIdx.x+blockIdx.y*gridDim.x >= nblocks')
      code('return;')
      ENDIF()
      IF('threadIdx.x==0')
      code('')
      comm('get sizes and shift pointers and direct-mapped data')
      code('')
      code('int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x  + block_offset];')
      code('')
      code('nelem    = nelems[blockId];')
      code('offset_b = offset[blockId];')
      code('')

      if ind_inc:
        code('nelems2  = blockDim.x*(1+(nelem-1)/blockDim.x);')
        code('ncolor   = ncolors[blockId];')
        code('')

      if inc_stage==1 and ind_inc:
        for g_m in range (0,ninds_staged):
          if indopts_staged[g_m-1] > 0:
            IF('optflags & 1<<'+str(optidxs[indopts_staged[g_m-1]]))
          code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_size = ind_arg_sizes['+str(g_m)+'+blockId*'+ str(ninds_staged)+'];')
          if indopts_staged[g_m-1] > 0:
            ENDIF()

        code('')
        for m in range (1,ninds_staged+1):
          g_m = m - 1
          c = [i for i in range(nargs) if inds_staged[i]==m]
          code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_map = &ind_map['+str(cumulative_indirect_index[c[0]])+\
          '*set_size] + ind_arg_offs['+str(m-1)+'+blockId*'+str(ninds_staged)+'];')

        code('')
        comm('set shared memory pointers')
        code('int nbytes = 0;')

        for g_m in range(0,ninds_staged):
          code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_s = ('+typs[invinds_staged[g_m]]+' *) &shared[nbytes];')
          if g_m < ninds_staged-1:
            if indopts_staged[g_m-1] > 0:
              IF('optflags & 1<<'+str(optidxs[indopts_staged[g_m-1]]))
            code('nbytes    += ROUND_UP(ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_size*sizeof('+typs[invinds_staged[g_m]]+')*'+dims[invinds_staged[g_m]]+');')
            if indopts_staged[g_m-1] > 0:
              ENDIF()


      ENDIF()
      code('__syncthreads(); // make sure all of above completed')
      code('')

      if inc_stage==1:
        for g_m in range(0,ninds):
          if indaccs[g_m] == OP_INC:
            FOR_INC('n','threadIdx.x','ind_ARG_size*<INDDIM>','blockDim.x')
            code('ind_ARG_s[n] = ZERO_<INDTYP>;')
            ENDFOR()
        if ind_inc:
          code('')
          code('__syncthreads();')
          code('')

      if ind_inc:
        FOR_INC('n','threadIdx.x','nelems2','blockDim.x')
        code('int col2 = -1;')
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx;')
        IF('n<nelem')
        comm('initialise local variables')

        for g_m in range(0,nargs):
          if maps[g_m]==OP_MAP and accs[g_m]==OP_INC:
            FOR('d','0','<DIM>')
            code('<ARG>_l[d] = ZERO_<TYP>;')
            ENDFOR()
      else:
        FOR_INC('n','threadIdx.x','nelem','blockDim.x')
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx;')

      #non-optional maps
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k):
          k = k + [(0*nargs+mapinds[g_m])] #non-opt
          k = k + [(1*nargs+mapinds[g_m])] #opt
          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];')

      #whatever didn't come up and is opt
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in      k)):
          k = k + [(optflags[g_m]*nargs+mapinds[g_m])]
          if optflags[g_m]==1:
            IF('optflags & 1<<'+str(optidxs[g_m]))

          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];')
          if optflags[g_m]==1:
            ENDIF()
 
      code('')
      for g_m in range (0,nargs):
        if accs[g_m] <> OP_INC: #TODO: add opt handling here
          u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
          if len(u) > 0 and vectorised[g_m] > 0:
            if accs[g_m] == OP_READ:
              line = 'const <TYP>* <ARG>_vec[] = {\n'
            else:
              line = '<TYP>* <ARG>_vec[] = {\n'

            v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
            first = [i for i in range(0,len(v)) if v[i] == 1]
            first = first[0]

            indent = ' '*(depth+2)
            for k in range(0,sum(v)):
              if soaflags[g_m]:
                line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[map'+str(mapinds[g_m+k])+'idx],\n'
              else:
                line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[<DIM> * map'+str(mapinds[g_m+k])+'idx],\n'
            line = line[:-2]+'};'
            code(line)
#
# simple version for global coloring
#
    elif ninds>0:
      code('int tid = threadIdx.x + blockIdx.x * blockDim.x;')
      IF('tid + start < end')
      code('int n = col_reord[tid + start];')
      comm('initialise local variables')
      #mapidx declarations
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
          k = k + [mapinds[g_m]]
          code('int map'+str(mapinds[g_m])+'idx;')

      #non-optional maps
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k):
          k = k + [(0*nargs+mapinds[g_m])] #non-opt
          k = k + [(1*nargs+mapinds[g_m])] #opt
          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];')

      #whatever didn't come up and is opt
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in      k)):
          k = k + [(optflags[g_m]*nargs+mapinds[g_m])]
          if optflags[g_m]==1:
            IF('optflags & 1<<'+str(optidxs[g_m]))

          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];')
          if optflags[g_m]==1:
            ENDIF()

      for g_m in range (0,nargs):
          u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
          if len(u) > 0 and vectorised[g_m] > 0:
            if accs[g_m] == OP_READ:
              line = 'const <TYP>* <ARG>_vec[] = {\n'
            else:
              line = '<TYP>* <ARG>_vec[] = {\n'

            v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
            first = [i for i in range(0,len(v)) if v[i] == 1]
            first = first[0]

            indent = ' '*(depth+2)
            for k in range(0,sum(v)):
              if soaflags[g_m]:
                line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[map'+str(mapinds[g_m+k])+'idx],\n'
              else:
                line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[<DIM> * map'+str(mapinds[g_m+k])+'idx],\n'
            line = line[:-2]+'};'
            code(line)




#
# simple alternative when no indirection
#
    else:
      code('')
      comm('process set elements')
      FOR_INC('n','threadIdx.x+blockIdx.x*blockDim.x','set_size','blockDim.x*gridDim.x')

#
# kernel call
#
    code('')
    comm('user-supplied kernel call')
    line = name+'_gpu('
    prefix = ' '*len(name)
    a = 0 #only apply indentation if its not the 0th argument
    indent =''
    for m in range (0, nargs):
      if a > 0:
        indent = '     '+' '*len(name)

      if maps[m] == OP_GBL:
        if accs[m] == OP_READ or accs[m] == OP_WRITE:
          line += rep(indent+'<ARG>,\n',m)
        else:
          line += rep(indent+'<ARG>_l,\n',m);
        a =a+1
      elif maps[m]==OP_MAP and  accs[m]==OP_INC and not op_color2:
        if vectorised[m]:
          if m+1 in unique_args:
            line += rep(indent+'<ARG>_vec,\n',m)
        else:
          line += rep(indent+'<ARG>_l,\n',m)
        a =a+1
      elif maps[m]==OP_MAP:
        if vectorised[m]:
          if m+1 in unique_args:
            line += rep(indent+'<ARG>_vec,\n',m)
        else:
          if soaflags[m]:
            line += rep(indent+'ind_arg'+str(inds[m]-1)+'+map'+str(mapinds[m])+'idx,'+'\n',m)
          else:
            line += rep(indent+'ind_arg'+str(inds[m]-1)+'+map'+str(mapinds[m])+'idx*<DIM>,'+'\n',m)
        a =a+1
      elif maps[m]==OP_ID:
        if ninds>0 and not op_color2:
          if soaflags[m]:
            line += rep(indent+'<ARG>+(n+offset_b),\n',m)
          else:
            line += rep(indent+'<ARG>+(n+offset_b)*<DIM>,\n',m)
          a =a+1
        else:
          if soaflags[m]:
            line += rep(indent+'<ARG>+n,\n',m)
          else:
            line += rep(indent+'<ARG>+n*<DIM>,\n',m)
          a =a+1
      else:
        print 'internal error 1 '

    code(line[0:-2]+');') #remove final ',' and \n

#
# updating for indirect kernels ...
#
    if ninds>0 and not op_color2:
      if ind_inc:
        code('col2 = colors[n+offset_b];')
        ENDIF()
        code('')
        comm('store local variables')
        code('')
        if inc_stage==1:
          for g_m in range(0,nargs):
            if maps[g_m]==OP_MAP and accs[g_m]==OP_INC:
              code('int <ARG>_map;')
          IF('col2>=0')
          for g_m in range(0,nargs):
            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              code('<ARG>_map = arg_map['+str(cumulative_indirect_index[g_m])+'*set_size+n+offset_b];')
          ENDIF()
          code('')

        FOR('col','0','ncolor')
        IF('col2==col')

        if inc_stage==1:
          for g_m in range(0,nargs):
            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              if optflags[g_m]==1:
                IF('optflags & 1<<'+str(optidxs[g_m]))
              for d in range(0,int(dims[g_m])):
                if soaflags[g_m]:
                  code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'_s[<ARG>_map+'+str(d)+'*ind_arg'+str(inds[g_m]-1)+'_size];')
                else:
                  code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'_s['+str(d)+'+<ARG>_map*<DIM>];')
#          for g_m in range(0,nargs):
#            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              for d in range(0,int(dims[g_m])):
                if soaflags[g_m]:
                  code('ind_arg'+str(inds[g_m]-1)+'_s[<ARG>_map+'+str(d)+'*ind_arg'+str(inds[g_m]-1)+'_size] = <ARG>_l['+str(d)+'];')
                else:
                  code('ind_arg'+str(inds[g_m]-1)+'_s['+str(d)+'+<ARG>_map*<DIM>] = <ARG>_l['+str(d)+'];')
                
              if optflags[g_m]==1:
                ENDIF()
        else:
          for g_m in range(0,nargs):
            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              if optflags[g_m]==1:
                IF('optflags & 1<<'+str(optidxs[g_m]))
              for d in range(0,int(dims[g_m])):
                if soaflags[g_m]:
                  code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx];')
                else:
                  code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*<DIM>];')
#          for g_m in range(0,nargs):
#            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              for d in range(0,int(dims[g_m])):
                if soaflags[g_m]:
                  code('ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx] = <ARG>_l['+str(d)+'];')
                else:
                  code('ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*<DIM>] = <ARG>_l['+str(d)+'];')
              if optflags[g_m]==1:
                ENDIF()

        ENDFOR()
        code('__syncthreads();')
        ENDFOR()
    ENDFOR()

    if inc_stage:
      for g_m in range(0,ninds):
        if indaccs[g_m]==OP_INC:
          if indopts[g_m] > 0:
            IF('optflags & 1<<'+str(optidxs[indopts[g_m-1]]))
          if soaflags[invinds[g_m]]:
            FOR_INC('n','threadIdx.x','<INDARG>_size','blockDim.x')
            for d in range(0,int(dims[invinds[g_m]])):
              code('arg'+str(invinds[g_m])+'_l['+str(d)+'] = <INDARG>_s[n+'+str(d)+'*<INDARG>_size] + <INDARG>[<INDARG>_map[n]+'+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'];')
            for d in range(0,int(dims[invinds[g_m]])):
              code('<INDARG>[<INDARG>_map[n]+'+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'] = arg'+str(invinds[g_m])+'_l['+str(d)+'];')
            ENDFOR()
          else:
            FOR_INC('n','threadIdx.x','<INDARG>_size*<INDDIM>','blockDim.x')
            code('<INDARG>[n%<INDDIM>+<INDARG>_map[n/<INDDIM>]*<INDDIM>] += <INDARG>_s[n];')
            ENDFOR()
          if indopts[g_m] > 0:
            ENDIF()

#
# global reduction
#
    if reduct:
       code('')
       comm('global reductions')
       code('')
       for m in range (0,nargs):
         g_m = m
         if maps[m]==OP_GBL and accs[m]<>OP_READ and accs[m] <> OP_WRITE:
           FOR('d','0','<DIM>')
           if accs[m]==OP_INC:
             code('op_reduction<OP_INC>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);')
           elif accs[m]==OP_MIN:
             code('op_reduction<OP_MIN>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);')
           elif accs[m]==OP_MAX:
             code('op_reduction<OP_MAX>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);')
           else:
             print 'internal error: invalid reduction option'
             sys.exit(2);
           ENDFOR()
    depth -= 2
    code('}')
    code('')

##########################################################################
# then C++ stub function
##########################################################################

    code('')
    comm('host stub function')
    code('void op_par_loop_'+name+'(char const *name, op_set set,')
    depth += 2

    for m in unique_args:
      g_m = m - 1
      if m == unique_args[len(unique_args)-1]:
        code('op_arg <ARG>){')
        code('')
      else:
        code('op_arg <ARG>,')

    for g_m in range (0,nargs):
      if maps[g_m]==OP_GBL:
        code('<TYP>*<ARG>h = (<TYP> *)<ARG>.data;')

    code('int nargs = '+str(nargs)+';')
    code('op_arg args['+str(nargs)+'];')
    code('')


    for g_m in range (0,nargs):
      u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
      if len(u) > 0 and vectorised[g_m] > 0:
        code('<ARG>.idx = 0;')
        code('args['+str(g_m)+'] = <ARG>;')

        v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
        first = [i for i in range(0,len(v)) if v[i] == 1]
        first = first[0]
        if (optflags[g_m] == 1):
          argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, '
        else:
          argtyp = 'op_arg_dat('

        FOR('v','1',str(sum(v)))
        code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
        str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');')
        ENDFOR()
        code('')

      elif vectorised[g_m]>0:
        pass
      else:
        code('args['+str(g_m)+'] = <ARG>;')

    if nopts>0:
      code('int optflags = 0;')
      for i in range(0,nargs):
        if optflags[i] == 1:
          IF('args['+str(i)+'].opt')
          code('optflags |= 1<<'+str(optidxs[i])+';')
          ENDIF()
    if nopts > 30:
      print 'ERROR: too many optional arguments to store flags in an integer'
#
# start timing
#
    code('')
    comm(' initialise timers')
    code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
    code('op_timing_realloc('+str(nk)+');')
    code('op_timers_core(&cpu_t1, &wall_t1);')
    code('OP_kernels[' +str(nk)+ '].name      = name;')
    code('OP_kernels[' +str(nk)+ '].count    += 1;')
    code('')

#
#   indirect bits
#
    if ninds>0:
      code('')
      code('int    ninds   = '+str(ninds)+';')
      line = 'int    inds['+str(nargs)+'] = {'
      for m in range(0,nargs):
        line += str(inds[m]-1)+','
      code(line[:-1]+'};')
      code('')

      IF('OP_diags>2')
      code('printf(" kernel routine with indirection: '+name+'\\n");')
      ENDIF()

      code('')
      comm('get plan')
      code('#ifdef OP_PART_SIZE_'+ str(nk))
      code('  int part_size = OP_PART_SIZE_'+str(nk)+';')
      code('#else')
      code('  int part_size = OP_part_size;')
      code('#endif')
      code('')
      code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);')

#
# direct bit
#
    else:
      code('')
      IF('OP_diags>2')
      code('printf(" kernel routine w/o indirection:  '+ name + '");')
      ENDIF()
      code('')
      code('op_mpi_halo_exchanges_cuda(set, nargs, args);')

    IF('set->size > 0')
    code('')

#
# kernel call for indirect version
#
    if ninds>0:
      if inc_stage==1 and ind_inc:
        code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_INC);')
      elif op_color2:
        code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);')
      else:
        code('op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);')
      code('')


#
# transfer constants
#
    g = [i for i in range(0,nargs) if maps[i] == OP_GBL and (accs[i] == OP_READ or accs[i] == OP_WRITE)]
    if len(g)>0:
      comm('transfer constants to GPU')
      code('int consts_bytes = 0;')
      for m in range(0,nargs):
        g_m = m
        if maps[m]==OP_GBL and (accs[m]==OP_READ or accs[m] == OP_WRITE):
          code('consts_bytes += ROUND_UP(<DIM>*sizeof(<TYP>));')

      code('reallocConstArrays(consts_bytes);')
      code('consts_bytes = 0;')

      for m in range(0,nargs):
        if maps[m]==OP_GBL and (accs[m] == OP_READ or accs[m] == OP_WRITE):
          g_m = m
          code('<ARG>.data   = OP_consts_h + consts_bytes;')
          code('<ARG>.data_d = OP_consts_d + consts_bytes;')
          FOR('d','0','<DIM>')
          code('((<TYP> *)<ARG>.data)[d] = <ARG>h[d];')
          ENDFOR()
          code('consts_bytes += ROUND_UP(<DIM>*sizeof(<TYP>));')
      code('mvConstArraysToDevice(consts_bytes);')
      code('')

      #managing constants
    if any_soa:
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))')
            code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');')
            code('cudaMemcpyToSymbol(opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT, &opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST,sizeof(int));')
            ENDIF()
      if dir_soa<>-1:
          IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))')
          code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');')
          code('cudaMemcpyToSymbol(direct_'+name+'_stride_OP2CONSTANT,&direct_'+name+'_stride_OP2HOST,sizeof(int));')
          ENDIF()

#
# transfer global reduction initial data
#

    if ninds == 0:
      comm('set CUDA execution parameters')
      code('#ifdef OP_BLOCK_SIZE_'+str(nk))
      code('  int nthread = OP_BLOCK_SIZE_'+str(nk)+';')
      code('#else')
      code('  int nthread = OP_block_size;')
      comm('  int nthread = 128;')
      code('#endif')
      code('')
      code('int nblocks = 200;')
      code('')

    if reduct:
      comm('transfer global reduction data to GPU')
      if ninds>0:
        code('int maxblocks = 0;')
        FOR('col','0','Plan->ncolors')
        code('maxblocks = MAX(maxblocks,Plan->ncolblk[col]);')
        ENDFOR()
      else:
        code('int maxblocks = nblocks;')

      code('int reduct_bytes = 0;')
      code('int reduct_size  = 0;')

      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE:
          code('reduct_bytes += ROUND_UP(maxblocks*<DIM>*sizeof(<TYP>));')
          code('reduct_size   = MAX(reduct_size,sizeof(<TYP>));')

      code('reallocReductArrays(reduct_bytes);')
      code('reduct_bytes = 0;')

      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE:
          code('<ARG>.data   = OP_reduct_h + reduct_bytes;')
          code('<ARG>.data_d = OP_reduct_d + reduct_bytes;')
          FOR('b','0','maxblocks')
          FOR('d','0','<DIM>')
          if accs[g_m]==OP_INC:
            code('((<TYP> *)<ARG>.data)[d+b*<DIM>] = ZERO_<TYP>;')
          else:
            code('((<TYP> *)<ARG>.data)[d+b*<DIM>] = <ARG>h[d];')
          ENDFOR()
          ENDFOR()
          code('reduct_bytes += ROUND_UP(maxblocks*<DIM>*sizeof(<TYP>));')
      code('mvReductArraysToDevice(reduct_bytes);')
      code('')

#
# kernel call for indirect version
#
    if ninds>0:
      comm('execute plan')
      if not op_color2:
        code('')
        code('int block_offset = 0;')
      FOR('col','0','Plan->ncolors')
      IF('col==Plan->ncolors_core')
      code('op_mpi_wait_all_cuda(nargs, args);')
      ENDIF()
      code('#ifdef OP_BLOCK_SIZE_'+str(nk))
      code('int nthread = OP_BLOCK_SIZE_'+str(nk)+';')
      code('#else')
      code('int nthread = OP_block_size;')
      code('#endif')
      code('')
      if op_color2:
        code('int start = Plan->col_offsets[0][col];')
        code('int end = Plan->col_offsets[0][col+1];')
        code('int nblocks = (end - start - 1)/nthread + 1;')
      else:
        code('dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col],')
        code('Plan->ncolblk[col] >= (1<<16) ? (Plan->ncolblk[col]-1)/65535+1: 1, 1);')
        IF('Plan->ncolblk[col] > 0')

      if reduct or (inc_stage==1 and ind_inc):
        if reduct and inc_stage==1:
          code('int nshared = MAX(Plan->nshared,reduct_size*nthread);')
        elif reduct:
          code('int nshared = reduct_size*nthread;')
        else:
          code('int nshared = Plan->nsharedCol[col];')
        code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(')
      else:
        code('op_cuda_'+name+'<<<nblocks,nthread>>>(')

      if nopts > 0:
        code('optflags,')
      for m in range(1,ninds+1):
        g_m = invinds[m-1]
        code('(<TYP> *)<ARG>.data_d,')
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            code('arg'+str(g_m)+'.map_data_d, ')
      for g_m in range(0,nargs):
        if inds[g_m]==0:
          code('(<TYP>*)<ARG>.data_d,')

      if inc_stage==1 and ind_inc:
        code('Plan->ind_map,')
        code('Plan->loc_map,')
        code('Plan->ind_sizes,')
        code('Plan->ind_offs,')
      if op_color2:
        code('start,')
        code('end,')
        code('Plan->col_reord,')
      else:
        code('block_offset,')
        code('Plan->blkmap,')
        code('Plan->offset,')
        code('Plan->nelems,')
        code('Plan->nthrcol,')
        code('Plan->thrcol,')
        code('Plan->ncolblk[col],')
      code('set->size+set->exec_size);')
      code('')
      if reduct:
        comm('transfer global reduction data back to CPU')
        IF('col == Plan->ncolors_owned-1')
        code('mvReductArraysToHost(reduct_bytes);')
        ENDIF()
      if not op_color2:
        ENDFOR() #TODO sztem ez forditva van...
        code('block_offset += Plan->ncolblk[col];')
      ENDIF()
#
# kernel call for direct version
#
    else:
      if reduct:
        code('int nshared = reduct_size*nthread;')
        code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(')
      else:
        code('op_cuda_'+name+'<<<nblocks,nthread>>>(')

      indent = '  '#*(len(name)+42)
      if nopts > 0:
        code(indent+'optflags,')
      for g_m in range(0,nargs):
        if g_m > 0:
          code(indent+'(<TYP> *) <ARG>.data_d,')
        else:
          code(indent+'(<TYP> *) <ARG>.data_d,')

      code(indent+'set->size );')

    if ninds>0:
      code('OP_kernels['+str(nk)+'].transfer  += Plan->transfer;')
      code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;')


#
# transfer global reduction initial data
#
    if reduct:
      if ninds == 0:
        comm('transfer global reduction data back to CPU')
        code('mvReductArraysToHost(reduct_bytes);')

      for m in range(0,nargs):
        g_m = m
        if maps[m]==OP_GBL and accs[m]<>OP_READ and accs[m] <> OP_WRITE:
          FOR('b','0','maxblocks')
          FOR('d','0','<DIM>')
          if accs[m]==OP_INC:
            code('<ARG>h[d] = <ARG>h[d] + ((<TYP> *)<ARG>.data)[d+b*<DIM>];')
          elif accs[m]==OP_MIN:
            code('<ARG>h[d] = MIN(<ARG>h[d],((<TYP> *)<ARG>.data)[d+b*<DIM>]);')
          elif accs[m]==OP_MAX:
            code('<ARG>h[d] = MAX(<ARG>h[d],((<TYP> *)<ARG>.data)[d+b*<DIM>]);')
          ENDFOR()
          ENDFOR()

          code('<ARG>.data = (char *)<ARG>h;')
          code('op_mpi_reduce(&<ARG>,<ARG>h);')
          
    for g_m in range(0,nargs):
      if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE:
        code('')
        code('mvConstArraysToHost(consts_bytes);')
        break

    for g_m in range(0,nargs):
      if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE:
        FOR('d','0','<DIM>')
        code('<ARG>h[d] = ((<TYP> *)<ARG>.data)[d];') 
        ENDFOR()
        code('<ARG>.data = (char *)<ARG>h;')
        code('op_mpi_reduce(&<ARG>,<ARG>h);')

    ENDIF()
    code('op_mpi_set_dirtybit_cuda(nargs, args);')

#
# update kernel record
#

    code('cutilSafeCall(cudaDeviceSynchronize());')
    comm('update kernel record')
    code('op_timers_core(&cpu_t2, &wall_t2);')
    code('OP_kernels[' +str(nk)+ '].time     += wall_t2 - wall_t1;')

    if ninds == 0:
      line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *'

      for g_m in range (0,nargs):
        if optflags[g_m]==1:
          IF('<ARG>.opt')
        if maps[g_m]<>OP_GBL:
          if accs[g_m]==OP_READ:
            code(line+' <ARG>.size;')
          else:
            code(line+' <ARG>.size * 2.0f;')
        if optflags[g_m]==1:
          ENDIF()
    depth = depth - 2
    code('}')


##########################################################################
#  output individual kernel file
##########################################################################
    if not os.path.exists('cuda'):
        os.makedirs('cuda')
    fid = open('cuda/'+name+'_kernel.cu','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

# end of main kernel call loop


##########################################################################
#  output one master kernel file
##########################################################################

  file_text = ''

  comm('global constants')

  code('#ifndef MAX_CONST_SIZE')
  code('#define MAX_CONST_SIZE 128')
  code('#endif')
  code('')

  for nc in range (0,len(consts)):
    if consts[nc]['dim']==1:
      code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_cuda;')
    else:
      if consts[nc]['dim'] > 0:
        num = str(consts[nc]['dim'])
      else:
        num = 'MAX_CONST_SIZE'

      code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_cuda['+num+'];')
  code('')

  comm('header')

  if os.path.exists('./user_types.h'):
    code('#ifndef OP_FUN_PREFIX\n#define OP_FUN_PREFIX __host__ __device__\n#endif')
    code('#include "../user_types.h"')
  code('#include "op_lib_cpp.h"')
  code('#include "op_cuda_rt_support.h"')
  code('#include "op_cuda_reduction.h"')

  code('')
  code('void op_decl_const_char(int dim, char const *type,')
  code('int size, char *dat, char const *name){')
  depth = depth + 2

  code('if (!OP_hybrid_gpu) return;')
  for nc in range(0,len(consts)):
    IF('!strcmp(name,"'+consts[nc]['name']+'")')
    if consts[nc]['dim'] < 0:
      IF('!strcmp(name,"'+consts[nc]['name']+'") && size>MAX_CONST_SIZE) {')
      code('printf("error: MAX_CONST_SIZE not big enough\n"); exit(1);')
      ENDIF()
    code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+'_cuda, dat, dim*size));')
    ENDIF()
    code('else ')

  code('{')
  depth = depth + 2
  code('printf("error: unknown const name\\n"); exit(1);')
  ENDIF()


  depth = depth - 2
  code('}')
  code('')
  comm('user kernel files')

  for nk in range(0,len(kernels)):
    file_text = file_text +\
    '#include "'+kernels[nk]['name']+'_kernel.cu"\n'

  master = master.split('.')[0]
  fid = open('cuda/'+master.split('.')[0]+'_kernels.cu','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()
Пример #3
0
def op2_gen_mpi_vec(master, date, consts, kernels):

  global dims, idxs, typs, indtyps, inddims
  global FORTRAN, CPP, g_m, file_text, depth

  OP_ID   = 1;  OP_GBL   = 2;  OP_MAP = 3;

  OP_READ = 1;  OP_WRITE = 2;  OP_RW  = 3;
  OP_INC  = 4;  OP_MAX   = 5;  OP_MIN = 6;

  accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ]

  grouped = 0

  any_soa = 0
  for nk in range (0,len(kernels)):
    any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

  for nk in range (0,len(kernels)):

    name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
            ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
            unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])
#
# set three logicals
#
    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_INC:
        j = i
    ind_inc = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_GBL and accs[i] != OP_READ:
        j = i
    reduct = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP :
        j = i
    indirect_kernel = j >= 0

    if nargs != nargs_novec:
      return
####################################################################################
#  generate the user kernel function - creating versions for vectorisation as needed
####################################################################################

    FORTRAN = 0;
    CPP     = 1;
    g_m = 0;
    file_text = ''
    depth = 0

#
# First original version
#
    comm('user function')
    file_name = decl_filepath

    f = open(file_name, 'r')
    kernel_text = f.read()
    file_text += kernel_text
    f.close()

    ## Clang compiler can struggle to vectorize a loop if it uses a mix of
    ## Python-generated simd arrays for indirect data AND pointers to direct
    ## data. Fix by also generating simd arrays for direct data:
    do_gen_direct_simd_arrays = True

#
# Modified vectorisable version if its an indirect kernel
# - direct kernels can be vectorised without modification
#
    if indirect_kernel:
      code('#ifdef VECTORIZE')
      comm('user function -- modified for vectorisation')
      f = open(file_name, 'r')
      kernel_text = f.read()
      f.close()

      kernel_text = op2_gen_common.comment_remover(kernel_text)
      kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text)

      p = re.compile('void\\s+\\b'+name+'\\b')
      i = p.search(kernel_text).start()

      if(i < 0):
        print("\n********")
        print("Error: cannot locate user kernel function name: "+name+" - Aborting code generation")
        exit(2)
      i2 = i

      #i = kernel_text[0:i].rfind('\n') #reverse find
      j = kernel_text[i:].find('{')
      k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}')
      signature_text = kernel_text[i:i+j]
      l = signature_text[0:].find('(')
      head_text = signature_text[0:l] #save function name
      m = op2_gen_common.para_parse(signature_text, 0, '(', ')')
      signature_text = signature_text[l+1:m]
      body_text = kernel_text[i+j+1:k]

      ## Replace occurrences of '#include "<FILE>"' within loop with the contents of <FILE>:
      body_text = op2_gen_common.replace_local_includes_with_file_contents(body_text, os.path.dirname(master))


      # check for number of arguments
      nargs_actual = len(signature_text.split(','))
      if nargs_actual != nargs:
          print(('Error parsing user kernel({0}): must have {1} arguments (instead it has {2})'.format(name, nargs, nargs_actual)))
          return

      new_signature_text = ''
      for i in range(0,nargs):
        var = signature_text.split(',')[i].strip()

        if do_gen_direct_simd_arrays:
          do_gen_simd_array_arg = maps[i] != OP_GBL
        else:
          do_gen_simd_array_arg = maps[i] != OP_GBL and maps[i] != OP_ID
        if do_gen_simd_array_arg:
          #remove * and add [*][SIMD_VEC]
          var = var.replace('*','')
          #locate var in body and replace by adding [idx]
          length = len(re.compile('\\s+\\b').split(var))
          var2 = re.compile('\\s+\\b').split(var)[length-1].strip()

          #print var2

          body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text)
          array_access_pattern = '\[[\w\(\)\+\-\*\s\\\\]*\]'

          ## It has been observed that vectorisation can fail on loops with increments,
          ## but replacing them with writes succeeds.
          ## For example with Clang on particular loops, vectorisation fails with message:
          ##   "loop not vectorized: loop control flow is not understood by vectorizer"
          ## replacing increments with writes solves this.
          ## Replacement is data-safe due to use of local/intermediate SIMD arrays.
          ## Hopefully the regex is matching all increments.
          ## And for loops that were being vectorised, this change can give a small perf boost.
          if maps[i] == OP_MAP and accs[i] == OP_INC:
            ## Replace 'var' increments with writes:
            body_text = re.sub(r'('+var2+array_access_pattern+'\s*'+')'+re.escape("+="), r'\1'+'=', body_text)

          ## Append vector array access:
          body_text = re.sub(r'('+var2+array_access_pattern+')', r'\1'+'[idx]', body_text)

          var = var + '[][SIMD_VEC]'
          #var = var + '[restrict][SIMD_VEC]'
        new_signature_text +=  var+', '


      #add ( , idx and )
      signature_text = "#if defined __clang__ || defined __GNUC__\n"
      signature_text += "__attribute__((always_inline))\n"
      signature_text += "#endif\n"
      signature_text += "inline " + head_text + '( '+new_signature_text + 'int idx ) {'
      #finally update name
      signature_text = signature_text.replace(name,name+'_vec')

      #print head_text
      #print signature_text
      #print  body_text

      file_text += signature_text + body_text + '}\n'
      code('#endif');



##########################################################################
# then C++ stub function
##########################################################################

    code('')
    comm(' host stub function')
    code('void op_par_loop_'+name+'(char const *name, op_set set,')
    depth += 2

    for m in unique_args:
      g_m = m - 1
      if m == unique_args[len(unique_args)-1]:
        code('op_arg <ARG>){');
        code('')
      else:
        code('op_arg <ARG>,')

    code('int nargs = '+str(nargs)+';')
    code('op_arg args['+str(nargs)+'];')
    code('')

    for g_m in range (0,nargs):
      u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
      if len(u) > 0 and vectorised[g_m] > 0:
        code('<ARG>.idx = 0;')
        code('args['+str(g_m)+'] = <ARG>;')

        v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
        first = [i for i in range(0,len(v)) if v[i] == 1]
        first = first[0]
        if (optflags[g_m] == 1):
          argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, '
        else:
          argtyp = 'op_arg_dat('

        FOR('v','1',str(sum(v)))
        code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
        str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');')
        ENDFOR()
        code('')
      elif vectorised[g_m]>0:
        pass
      else:
        code('args['+str(g_m)+'] = <ARG>;')

#
# create aligned pointers
#
    comm('create aligned pointers for dats')
    for g_m in range (0,nargs):
        if maps[g_m] != OP_GBL:
          if (accs[g_m] == OP_INC or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE):
            code('ALIGNED_<TYP>       <TYP> * __restrict__ ptr'+\
            str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')
            #code('<TYP>* __restrict__ __attribute__((align_value (<TYP>_ALIGN)))  ptr'+\
            #str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')
            code('DECLARE_PTR_ALIGNED(ptr'+str(g_m)+',<TYP>_ALIGN);')

          else:
            code('ALIGNED_<TYP> const <TYP> * __restrict__ ptr'+\
            str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')
            code('DECLARE_PTR_ALIGNED(ptr'+str(g_m)+',<TYP>_ALIGN);')
            #code('const <TYP>* __restrict__ __attribute__((align_value (<TYP>_ALIGN)))  ptr'+\
            #str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')



#
# start timing
#
    code('')
    comm(' initialise timers')
    code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
    code('op_timing_realloc('+str(nk)+');')
    code('op_timers_core(&cpu_t1, &wall_t1);')
    code('')

#
#   indirect bits
#
    if ninds>0:
      IF('OP_diags>2')
      code('printf(" kernel routine with indirection: '+name+'\\n");')
      ENDIF()

#
# direct bit
#
    else:
      code('')
      IF('OP_diags>2')
      code('printf(" kernel routine w/o indirection:  '+ name + '");')
      ENDIF()

    code('')
    if grouped:
      code('int exec_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 1);')
    else:
      code('int exec_size = op_mpi_halo_exchanges(set, nargs, args);')

    code('')
    IF('exec_size >0')
    code('')

#
# kernel call for indirect version
#
    if ninds>0:
      code('#ifdef VECTORIZE')

      code('#pragma novector')
      FOR2('n','0','(exec_size/SIMD_VEC)*SIMD_VEC','SIMD_VEC')
      #initialize globals
      for g_m in range(0,nargs):
        if maps[g_m] == OP_GBL:
          code('<TYP> dat{0}[SIMD_VEC];'.format(g_m))
          FOR('i','0','SIMD_VEC')
          if accs[g_m] == OP_INC:
            code('dat{0}[i] = 0.0;'.format(g_m))
          elif accs[g_m] == OP_MAX:
            code('dat{0}[i] = -INFINITY;'.format(g_m))
          elif accs[g_m] == OP_MIN:
            code('dat{0}[i] = INFINITY;'.format(g_m))
          elif accs[g_m] == OP_READ:
            code('dat{0}[i] = *((<TYP>*)arg{0}.data);'.format(g_m))
          ENDFOR()

      code('if (n<set->core_size && n>0 && n % OP_mpi_test_frequency == 0)')
      code('  op_mpi_test_all(nargs,args);')
      IF('(n+SIMD_VEC >= set->core_size) && (n+SIMD_VEC-set->core_size < SIMD_VEC)')
      if grouped:
        code('op_mpi_wait_all_grouped(nargs, args, 1);')
      else:
        code('op_mpi_wait_all(nargs, args);')
      ENDIF()
      for g_m in range(0,nargs):
        if do_gen_direct_simd_arrays:
          if (maps[g_m] in [OP_MAP, OP_ID]) and (accs[g_m] in [OP_READ, OP_RW, OP_WRITE, OP_INC]):
            code('ALIGNED_<TYP> <TYP> dat'+str(g_m)+'[<DIM>][SIMD_VEC];')
        else:
          if maps[g_m] == OP_MAP and (accs[g_m] in [OP_READ, OP_RW, OP_WRITE, OP_INC]):
            code('ALIGNED_<TYP> <TYP> dat'+str(g_m)+'[<DIM>][SIMD_VEC];')

      #setup gathers
      idx_map_template = "int idx{0}_<DIM> = <DIM> * arg{1}.map_data[(n+i) * arg{1}.map->dim + {2}];"
      idx_id_template  = "int idx{0}_<DIM> = <DIM> * (n+i);"
      code('#pragma omp simd simdlen(SIMD_VEC)')
      FOR('i','0','SIMD_VEC')
      if nmaps > 0:
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP :
            if (accs[g_m] in [OP_READ, OP_RW, OP_WRITE]):#and (not mapinds[g_m] in k):
              code(idx_map_template.format(g_m, invmapinds[inds[g_m]-1], idxs[g_m]))
          elif do_gen_direct_simd_arrays and maps[g_m] == OP_ID :
            code(idx_id_template.format(g_m))
      code('')

      init_dat_template = "dat{0}[{1}][i] = (ptr{0})[idx{0}_<DIM> + {1}];"
      zero_dat_template = "dat{0}[{1}][i] = 0.0;"
      for g_m in range(0,nargs):
        if do_gen_direct_simd_arrays:
          ## also 'gather' directly-accessed data, because SOME compilers
          ## struggle to vectorise otherwise (e.g. Clang).
          if maps[g_m] != OP_GBL :
            if accs[g_m] in [OP_READ, OP_RW]:
              for d in range(0,int(dims[g_m])):
                code(init_dat_template.format(g_m, d))
              code('')
            elif accs[g_m] == OP_INC:
              for d in range(0,int(dims[g_m])):
                code(zero_dat_template.format(g_m, d))
              code('')
        else:
          if maps[g_m] == OP_MAP :
            if accs[g_m] in [OP_READ, OP_RW]:#and (not mapinds[g_m] in k):
              for d in range(0,int(dims[g_m])):
                init_dat_str = init_dat_template.format(g_m, d)
                code(init_dat_str)
              code('')
            elif (accs[g_m] == OP_INC):
              for d in range(0,int(dims[g_m])):
                zero_dat_str = zero_dat_template.format(g_m, d)
                code(zero_dat_str)
              code('')
          else: #globals
            if (accs[g_m] == OP_INC):
              # for d in range(0,int(dims[g_m])):
              #   code('dat'+str(g_m)+'[i] = 0.0;')
              # code('')
              pass

      ENDFOR()
      #kernel call
      code('#pragma omp simd simdlen(SIMD_VEC)')
      FOR('i','0','SIMD_VEC')
      line = name+'_vec('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if (not do_gen_direct_simd_arrays) and maps[g_m] == OP_ID:
          line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * (n+i)],'
        elif maps[g_m] == OP_GBL and accs[g_m] == OP_READ:
          line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data,'
        elif maps[g_m] == OP_GBL and accs[g_m] == OP_INC:
          line = line + indent +'&dat'+str(g_m)+'[i],'
        else:
          line = line + indent + 'dat'+str(g_m)+','
      line = line +indent +'i);'
      code(line)
      ENDFOR()
      #do the scatters
      FOR('i','0','SIMD_VEC')
      if nmaps > 0:
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP :
            if (accs[g_m] in [OP_INC, OP_RW, OP_WRITE]):#and (not mapinds[g_m] in k):
              code(idx_map_template.format(g_m, invmapinds[inds[g_m]-1], idxs[g_m]))
          elif do_gen_direct_simd_arrays and maps[g_m] == OP_ID :
            if (accs[g_m] in [OP_INC, OP_RW, OP_WRITE]):
              code(idx_id_template.format(g_m))
      code('')
      dat_scatter_inc_template = "(ptr{0})[idx{0}_<DIM> + {1}] += dat{0}[{1}][i];"
      dat_scatter_wr_template  = "(ptr{0})[idx{0}_<DIM> + {1}] = dat{0}[{1}][i];"
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP :
          if (accs[g_m] == OP_INC ):
            for d in range(0,int(dims[g_m])):
              code(dat_scatter_inc_template.format(g_m, d))
            code('')
          elif accs[g_m] in [OP_WRITE, OP_RW]:
            for d in range(0,int(dims[g_m])):
              code(dat_scatter_wr_template.format(g_m, d))
            code('')
        elif do_gen_direct_simd_arrays and maps[g_m] == OP_ID:
          ## also scatter directly-written data
          if (accs[g_m] == OP_INC ):
            for d in range(0,int(dims[g_m])):
              code(dat_scatter_inc_template.format(g_m, d))
          elif accs[g_m] in [OP_WRITE, OP_RW]:
            for d in range(0,int(dims[g_m])):
              code(dat_scatter_wr_template.format(g_m, d))
            code('')
      ENDFOR()

      #do reductions
      for g_m in range(0,nargs):
        if maps[g_m] == OP_GBL:
          FOR('i','0','SIMD_VEC')
          if accs[g_m] == OP_INC:
            code('*(<TYP>*)arg'+str(g_m)+'.data += dat'+str(g_m)+'[i];')
          elif accs[g_m] == OP_MAX:
            code('*(<TYP>*)arg'+str(g_m)+'.data = MAX(*(<TYP>*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);')
          elif accs[g_m] == OP_MIN:
            code('*(<TYP>*)arg'+str(g_m)+'.data = MIN(*(<TYP>*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);')
          ENDFOR()


      ENDFOR()
      code('')
      comm('remainder')
      FOR('n','(exec_size/SIMD_VEC)*SIMD_VEC','exec_size')
      depth = depth -2
      code('#else')
      FOR('n','0','exec_size')
      depth = depth -2
      code('#endif')
      depth = depth +2
      IF('n==set->core_size')
      if grouped:
        code('op_mpi_wait_all_grouped(nargs, args, 1);')
      else:
        code('op_mpi_wait_all(nargs, args);')
      ENDIF()
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx;')
      #do non-optional ones
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k) and (not optflags[g_m]):
            k = k + [mapinds[g_m]]
            code('map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];')
      #do optional ones
      if nmaps > 0:
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            if optflags[g_m]:
              IF('<ARG>.opt')
            else:
              k = k + [mapinds[g_m]]
            code('map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];')
            if optflags[g_m]:
              ENDIF()

      code('')
      line = name+'('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * n]'
        if maps[g_m] == OP_MAP:
          line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]'
        if maps[g_m] == OP_GBL:
          line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data'
        if g_m < nargs-1:
          line = line +','
        else:
           line = line +');'
      code(line)
      ENDFOR()

#
# kernel call for direct version
#
    else:
      code('#ifdef VECTORIZE')

      code('#pragma novector')
      FOR2('n','0','(exec_size/SIMD_VEC)*SIMD_VEC','SIMD_VEC')

	  #initialize globals
      for g_m in range(0,nargs):
        if maps[g_m] == OP_GBL:
          code('<TYP> dat{0}[SIMD_VEC];'.format(g_m))
          FOR('i','0','SIMD_VEC')
          if accs[g_m] == OP_INC:
            code('dat{0}[i] = 0.0;'.format(g_m))
          elif accs[g_m] == OP_MAX:
            code('dat{0}[i] = -INFINITY;'.format(g_m))
          elif accs[g_m] == OP_MIN:
            code('dat{0}[i] = INFINITY;'.format(g_m))
          elif accs[g_m] == OP_READ:
            code('dat{0}[i] = *((<TYP>*)arg{0}.data);'.format(g_m))
          ENDFOR()

      code('#pragma omp simd simdlen(SIMD_VEC)')
      FOR('i','0','SIMD_VEC')
      line = name+'('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * (n+i)]'
        if maps[g_m] == OP_MAP:
          line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]'
        if maps[g_m] == OP_GBL:
          line = line + indent +'&dat'+str(g_m)+'[i]'
        if g_m < nargs-1:
          line = line +','
        else:
           line = line +');'
      code(line)
      ENDFOR()
      #do reductions
      for g_m in range(0,nargs):
        if maps[g_m] == OP_GBL:
          FOR('i','0','SIMD_VEC')
          if accs[g_m] == OP_INC:
            code('*(<TYP>*)arg'+str(g_m)+'.data += dat'+str(g_m)+'[i];')
          elif accs[g_m] == OP_MAX:
            code('*(<TYP>*)arg'+str(g_m)+'.data = MAX(*(<TYP>*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);')
          elif accs[g_m] == OP_MIN:
            code('*(<TYP>*)arg'+str(g_m)+'.data = MIN(*(<TYP>*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);')
          ENDFOR()
      ENDFOR()

      comm('remainder')
      FOR ('n','(exec_size/SIMD_VEC)*SIMD_VEC','exec_size')
      depth = depth -2
      code('#else')
      FOR('n','0','exec_size')
      depth = depth -2
      code('#endif')
      depth = depth +2
      line = name+'('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+'*n]'
        if maps[g_m] == OP_GBL:
          line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data'
        if g_m < nargs-1:
          line = line +','
        else:
           line = line +');'
      code(line)
      ENDFOR()
    ENDIF()
    code('')

    #zero set size issues
    if ninds>0:
      IF('exec_size == 0 || exec_size == set->core_size')
      if grouped:
        code('op_mpi_wait_all_grouped(nargs, args, 1);')
      else:
        code('op_mpi_wait_all(nargs, args);')
      ENDIF()

#
# combine reduction data from multiple OpenMP threads
#
    comm(' combine reduction data')
    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ:
        code('op_mpi_reduce(&<ARG>,('+typs[g_m]+'*)<ARG>.data);')

    code('op_mpi_set_dirtybit(nargs, args);')
    code('')

#
# update kernel record
#

    comm(' update kernel record')
    code('op_timers_core(&cpu_t2, &wall_t2);')
    code('OP_kernels[' +str(nk)+ '].name      = name;')
    code('OP_kernels[' +str(nk)+ '].count    += 1;')
    code('OP_kernels[' +str(nk)+ '].time     += wall_t2 - wall_t1;')

    if ninds == 0:
      line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *'

      for g_m in range (0,nargs):
        if maps[g_m]!=OP_GBL:
          if accs[g_m]==OP_READ:
            code(line+' <ARG>.size;')
          else:
            code(line+' <ARG>.size * 2.0f;')
    else:
      names = []
      for g_m in range(0,ninds):
        mult=''
        if indaccs[g_m] != OP_WRITE and indaccs[g_m] != OP_READ:
          mult = ' * 2.0f'
        if not var[invinds[g_m]] in names:
          code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(invinds[g_m])+'.size'+mult+';')
          names = names + [var[invinds[g_m]]]
      for g_m in range(0,nargs):
        mult=''
        if accs[g_m] != OP_WRITE and accs[g_m] != OP_READ:
          mult = ' * 2.0f'
        if not var[g_m] in names:
          names = names + [var[invinds[g_m]]]
          if maps[g_m] == OP_ID:
            code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(g_m)+'.size'+mult+';')
          elif maps[g_m] == OP_GBL:
            code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(g_m)+'.size'+mult+';')
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(invinds[inds[g_m]-1])+'.map->dim * 4.0f;')

    depth -= 2
    code('}')


##########################################################################
#  output individual kernel file
##########################################################################
    if not os.path.exists('vec'):
        os.makedirs('vec')
    fid = open('vec/'+name+'_veckernel.cpp','w')
    date = datetime.datetime.now()
    #fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n')
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

# end of main kernel call loop


##########################################################################
#  output one master kernel file
##########################################################################

  file_text =''

  code('#define double_ALIGN 128')
  code('#define float_ALIGN 64')
  code('#define int_ALIGN 64')
  code('#ifdef VECTORIZE')
  code('#define SIMD_VEC 4')
  code('#define ALIGNED_double __attribute__((aligned(double_ALIGN)))')
  code('#define ALIGNED_float __attribute__((aligned(float_ALIGN)))')
  code('#define ALIGNED_int __attribute__((aligned(int_ALIGN)))')
  code('  #ifdef __ICC')
  code('    #define DECLARE_PTR_ALIGNED(X, Y) __assume_aligned(X, Y)')
  code('  #else')
  code('    #define DECLARE_PTR_ALIGNED(X, Y)')
  code('  #endif')
  code('#else')
  code('#define ALIGNED_double')
  code('#define ALIGNED_float')
  code('#define ALIGNED_int')
  code('#define DECLARE_PTR_ALIGNED(X, Y)')
  code('#endif')
  code('')

  comm(' global constants       ')

  for nc in range (0,len(consts)):
    if not consts[nc]['user_declared']:
      if consts[nc]['dim']==1:
        code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';')
      else:
        if consts[nc]['dim'].isdigit() and int(consts[nc]['dim']) > 0:
          num = str(consts[nc]['dim'])
        else:
          num = 'MAX_CONST_SIZE'
        code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];')
  code('')

  comm(' header                 ')

  if os.path.exists('./user_types.h'):
    code('#include "../user_types.h"')
  code('#include "op_lib_cpp.h"')
  code('')

  comm(' user kernel files')

  for nk in range(0,len(kernels)):
    code('#include "'+kernels[nk]['name']+'_veckernel.cpp"')
  master = master.split('.')[0]
  fid = open('vec/'+master.split('.')[0]+'_veckernels.cpp','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()
Пример #4
0
def op2_gen_openmp4(master, date, consts, kernels):

  global dims, idxs, typs, indtyps, inddims
  global FORTRAN, CPP, g_m, file_text, depth

  OP_ID   = 1;  OP_GBL   = 2;  OP_MAP = 3;

  OP_READ = 1;  OP_WRITE = 2;  OP_RW  = 3;
  OP_INC  = 4;  OP_MAX   = 5;  OP_MIN = 6;

  accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ]
  op2_compiler = os.getenv('OP2_COMPILER','0');
  any_soa = 0
  maptype = 'map'
  for nk in range (0,len(kernels)):
    any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

  for nk in range (0,len(kernels)):

    name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
            ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
            unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])

    optidxs = [0]*nargs
    indopts = [-1]*nargs
    nopts = 0
    for i in range(0,nargs):
      if optflags[i] == 1 and maps[i] == OP_ID:
        optidxs[i] = nopts
        nopts = nopts+1
      elif optflags[i] == 1 and maps[i] == OP_MAP:
        if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination
          optidxs[i] = nopts
          indopts[inds[i]-1] = i
          nopts = nopts+1
        else:
          optidxs[i] = optidxs[invinds[inds[i]-1]]
#
# set two logicals
#
    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_INC:
        j = i
    ind_inc = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_GBL and accs[i] <> OP_READ:
        j = i
    reduct = j >= 0

##########################################################################
#  start with the user kernel function
##########################################################################

    FORTRAN = 0;
    CPP     = 1;
    g_m = 0;
    file_text = ''
    depth = 0

    comm('user function')

    #strides for SoA
    if any_soa:
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;')
            code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;')
      dir_soa = -1
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1):
          code('int direct_'+name+'_stride_OP2CONSTANT;')
          code('int direct_'+name+'_stride_OP2HOST=-1;')
          dir_soa = g_m
          break

    comm('user function')
    file_name = decl_filepath

    f = open(file_name, 'r')
    kernel_text = f.read()
    f.close()

    kernel_text = op2_gen_common.comment_remover(kernel_text)
    kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text)

    p = re.compile('void\\s+\\b'+name+'\\b')
    i = p.search(kernel_text).start()

    if(i < 0):
      print "\n********"
      print "Error: cannot locate user kernel function name: "+name+" - Aborting code generation"
      exit(2)
    i2 = i

    #i = kernel_text[0:i].rfind('\n') #reverse find
    j = kernel_text[i:].find('{')
    k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}')
    signature_text = kernel_text[i:i+j]
    l = signature_text[0:].find('(')
    head_text = signature_text[0:l] #save function name
    m = op2_gen_common.para_parse(signature_text, 0, '(', ')')
    signature_text = signature_text[l+1:m]
    body_text = kernel_text[i+j+1:k]

    ## Replace occurrences of '#include "<FILE>"' within loop with the contents of <FILE>:
    body_text = op2_gen_common.replace_local_includes_with_file_contents(body_text, os.path.dirname(master))

    # check for number of arguments
    if len(signature_text.split(',')) != nargs_novec:
        print 'Error parsing user kernel(%s): must have %d arguments' \
              % name, nargs
        return

    for i in range(0,nargs_novec):
        var = signature_text.split(',')[i].strip()
        if kernels[nk]['soaflags'][i]:
          var = var.replace('*','')
          #locate var in body and replace by adding [idx]
          length = len(re.compile('\\s+\\b').split(var))
          var2 = re.compile('\\s+\\b').split(var)[length-1].strip()

          if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP:
            body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+ \
                op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)
          else:
            body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text)
            body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \
                op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)

    for nc in range(0,len(consts)): 
      varname = consts[nc]['name']
      body_text = re.sub('\\b'+varname+'\\b', varname+'_ompkernel',body_text)
#      if consts[nc]['dim'] == 1:
#        body_text = re.sub(varname+'(?!\w)', varname+'_ompkernel', body_text)
#      else:
#        body_text = re.sub('\*'+varname+'(?!\[)', varname+'[0]', body_text)
#        body_text = re.sub(r''+varname+'\[([A-Za-z0-9]*)\]'+'', varname+r'_ompkernel[\1]', body_text)

    vec = 0
    for n in range(0,nargs):
      if (vectorised[n] == 1):
        vec = 1
    kernel_params = [ var.strip() for var in signature_text.split(',')]
    if vec:
      new_kernel_params = []
      for m in range(0,nargs_novec):
        if int(kernels[nk]['idxs'][m])<0 and int(kernels[nk]['maps'][m]) == OP_MAP:
          new_kernel_params = new_kernel_params + [kernel_params[m]]*int(-1*int(kernels[nk]['idxs'][m]))
        else:
          new_kernel_params = new_kernel_params + [kernel_params[m]]
      kernel_params = new_kernel_params

    # collect constants used by kernel
    kernel_consts = []
    for nc in range(0,len(consts)):
      if body_text.find(consts[nc]['name']+'_ompkernel') != -1:
        kernel_consts.append(nc)

############################################################
#  omp4 function call definition
############################################################
    code('')
    func_call_signaure_text = 'void ' + name + '_omp4_kernel('
    params = ''
    indent = '\n' + '  '
    k = []
    for g_m in range(0, nargs):
      if maps[g_m] == OP_GBL:
        params += indent + rep('<TYP> *<ARG>,',g_m)
      if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k):
        k = k + [invmapinds[inds[g_m]-1]]
        params += indent +  'int *map'+str(mapinds[g_m])+','
        if maptype == 'map':
          params += indent +  'int map'+str(mapinds[g_m])+'size,'
      if maps[g_m] == OP_ID:
        params += indent + rep('<TYP> *data'+str(g_m)+',', g_m)
        if maptype == 'map':
          params += indent +  'int dat'+str(g_m)+'size,'
    for m in range(1,ninds+1):
      g_m = invinds[m-1]
      params += indent + rep('<TYP> *data'+str(g_m)+',', g_m)
      if maptype == 'map':
        params += indent +  'int dat'+str(g_m)+'size,'
    if ninds>0:
      # add indirect kernel specific params to kernel func call 
      params += indent + 'int *col_reord,' + indent + 'int set_size1,' + indent + 'int start,' + indent + 'int end,'
    else:
      # add direct kernel specific params to kernel func call 
      params += indent + 'int count,'
    params += indent + 'int num_teams,' + indent + 'int nthread'
    #add strides for SoA to params
    if any_soa:
      indent = ','+indent
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            params += indent + 'int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT'
      
      if dir_soa<>-1:
        params += indent + 'int direct_'+name+'_stride_OP2CONSTANT'
    if nopts>0:
      params += ', int optflags'
    code(func_call_signaure_text+params+');')

##########################################################################
# then C++ stub function
##########################################################################

    code('')
    comm(' host stub function')
    code('void op_par_loop_'+name+'(char const *name, op_set set,')
    depth += 2

    for m in unique_args:
      g_m = m - 1
      if m == unique_args[len(unique_args)-1]:
        code('op_arg <ARG>){');
        code('')
      else:
        code('op_arg <ARG>,')

    for g_m in range (0,nargs):
      if maps[g_m]==OP_GBL: #and accs[g_m] <> OP_READ:
        code('<TYP>*<ARG>h = (<TYP> *)<ARG>.data;')

    code('int nargs = '+str(nargs)+';')
    code('op_arg args['+str(nargs)+'];')
    code('')

    for g_m in range (0,nargs):
      u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
      if len(u) > 0 and vectorised[g_m] > 0:
        code('<ARG>.idx = 0;')
        code('args['+str(g_m)+'] = <ARG>;')

        v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
        first = [i for i in range(0,len(v)) if v[i] == 1]
        first = first[0]
        if (optflags[g_m] == 1):
          argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, '
        else:
          argtyp = 'op_arg_dat('

        FOR('v','1',str(sum(v)))
        code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
        str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');')
        ENDFOR()
        code('')
      elif vectorised[g_m]>0:
        pass
      else:
        code('args['+str(g_m)+'] = <ARG>;')

    if nopts>0:
      code('int optflags = 0;')
      for i in range(0,nargs):
        if optflags[i] == 1:
          IF('args['+str(i)+'].opt')
          code('optflags |= 1<<'+str(optidxs[i])+';')
          ENDIF()
    if nopts > 30:
      print 'ERROR: too many optional arguments to store flags in an integer'
#
# start timing
#
    code('')
    comm(' initialise timers')
    code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
    code('op_timing_realloc('+str(nk)+');')
    code('op_timers_core(&cpu_t1, &wall_t1);')
    code('OP_kernels[' +str(nk)+ '].name      = name;')
    code('OP_kernels[' +str(nk)+ '].count    += 1;')
    code('')

#
#   indirect bits
#
    if ninds>0:
      code('int  ninds   = '+str(ninds)+';')
      line = 'int  inds['+str(nargs)+'] = {'
      for m in range(0,nargs):
        line += str(inds[m]-1)+','
      code(line[:-1]+'};')
      code('')

      IF('OP_diags>2')
      code('printf(" kernel routine with indirection: '+name+'\\n");')
      ENDIF()
      code('')
      comm(' get plan')
      code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);')

#
# direct bit
#
    else:
      code('')
      IF('OP_diags>2')
      code('printf(" kernel routine w/o indirection:  '+ name + '");')
      ENDIF()
      code('')
      code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);')

#
# get part and block size
#
    code('')
    code('#ifdef OP_PART_SIZE_'+ str(nk))
    code('  int part_size = OP_PART_SIZE_'+str(nk)+';')
    code('#else')
    code('  int part_size = OP_part_size;')
    code('#endif')
    code('#ifdef OP_BLOCK_SIZE_'+ str(nk))
    code('  int nthread = OP_BLOCK_SIZE_'+str(nk)+';')
    code('#else')
    code('  int nthread = OP_block_size;')
    code('#endif')

    code('')
    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL: #and accs[g_m]<>OP_READ:
        if not dims[g_m].isdigit() or int(dims[g_m]) > 1:
          print 'ERROR: OpenMP 4 does not support multi-dimensional variables'
          exit(-1)
        code('<TYP> <ARG>_l = <ARG>h[0];')

    if ninds > 0:
      code('')
      code('int ncolors = 0;')
      code('int set_size1 = set->size + set->exec_size;')
    code('')
    IF('set_size >0')
    #managing constants
    if any_soa:
      code('')
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))')
            code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');')
            code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT = opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST;')
            ENDIF()
      if dir_soa<>-1:
          IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))')
          code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');')
          code('direct_'+name+'_stride_OP2CONSTANT = direct_'+name+'_stride_OP2HOST;')
          ENDIF()
    code('')
    comm('Set up typed device pointers for OpenMP')
    if nmaps > 0:
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k):
          k = k + [invmapinds[inds[g_m]-1]]
          code('int *map'+str(mapinds[g_m])+' = arg'+str(invmapinds[inds[g_m]-1])+'.map_data_d;')
          if maptype == 'map':
            code(' int map'+str(mapinds[g_m])+'size = arg'+str(invmapinds[inds[g_m]-1])+'.map->dim * set_size1;') 

    code('')
    for g_m in range(0,nargs):
      if maps[g_m] == OP_ID:
        code(typs[g_m]+'* data'+str(g_m)+' = ('+typs[g_m]+'*)arg'+str(g_m)+'.data_d;')
        if maptype == 'map':
          if optflags[g_m]:
              code('int dat'+str(g_m)+'size = (arg'+str(g_m)+'.opt?1:0) * getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;')
          else:
            code('int dat'+str(g_m)+'size = getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;')

    for m in range(1,ninds+1):
      g_m = invinds[m-1]
      code('<TYP> *data'+str(g_m)+' = (<TYP> *)<ARG>.data_d;')
      if maptype == 'map':
        if optflags[g_m]:
            code('int dat'+str(g_m)+'size = (arg'+str(g_m)+'.opt?1:0) * getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;')
        else:
          code('int dat'+str(g_m)+'size = getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;')
    
#
# prepare kernel params for indirect version
#
    if ninds>0:
      code('')
      code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);')
      code('ncolors = Plan->ncolors;')
      code('int *col_reord = Plan->col_reord;')
      code('')
      comm(' execute plan')
      FOR('col','0','Plan->ncolors')
      IF('col==1')
      code('op_mpi_wait_all_cuda(nargs, args);')
      ENDIF()
      code('int start = Plan->col_offsets[0][col];')
      code('int end = Plan->col_offsets[0][col+1];')
      code('')
#
# kernel function call
#
    indent = '\n' + ' ' * (depth+2)
    call_params = ','.join([ indent + re.sub(r'\*arg(\d+)',r'&arg\1_l',param.strip().split(' ')[-1]) for param in params.split(',')])
    call_params = call_params.replace('*','')
    # set params for indirect version
    if ninds>0:
      call_params = call_params.replace('num_teams','part_size!=0?(end-start-1)/part_size+1:(end-start-1)/nthread')
    # set params for direct version
    else:
      call_params = re.sub('count','set->size',call_params);
      call_params = call_params.replace('num_teams','part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread') 
    code(func_call_signaure_text.split(' ')[-1]+call_params+');')
    code('')

    if ninds>0:
      if reduct:
        comm(' combine reduction data')
        IF('col == Plan->ncolors_owned-1')
        for g_m in range(0,nargs):
          if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ:
            if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE:
              code('<ARG>h[0] = <ARG>_l;')
            elif accs[g_m]==OP_MIN:
              code('<ARG>h[0]  = MIN(<ARG>h[0],<ARG>_l);')
            elif  accs[g_m]==OP_MAX:
              code('<ARG>h[0]  = MAX(<ARG>h[0],<ARG>_l);')
            else:
              error('internal error: invalid reduction option')
        ENDIF()
      ENDFOR()
      code('OP_kernels['+str(nk)+'].transfer  += Plan->transfer;')
      code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;')

    ENDIF()
    code('')

    #zero set size issues
    if ninds>0:
      IF('set_size == 0 || set_size == set->core_size || ncolors == 1')
      code('op_mpi_wait_all_cuda(nargs, args);')
      ENDIF()

#
# combine reduction data from multiple OpenMP threads
#
    comm(' combine reduction data')
    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ:
        if ninds==0: #direct version only
          if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE:
            code('<ARG>h[0] = <ARG>_l;')
          elif accs[g_m]==OP_MIN:
            code('<ARG>h[0]  = MIN(<ARG>h[0],<ARG>_l);')
          elif accs[g_m]==OP_MAX:
            code('<ARG>h[0]  = MAX(<ARG>h[0],<ARG>_l);')
          else:
            print 'internal error: invalid reduction option'
        if typs[g_m] == 'double': #need for both direct and indirect
          code('op_mpi_reduce_double(&<ARG>,<ARG>h);')
        elif typs[g_m] == 'float':
          code('op_mpi_reduce_float(&<ARG>,<ARG>h);')
        elif typs[g_m] == 'int':
          code('op_mpi_reduce_int(&<ARG>,<ARG>h);')
        else:
          print 'Type '+typs[g_m]+' not supported in OpenMP4 code generator, please add it'
          exit(-1)


    code('op_mpi_set_dirtybit_cuda(nargs, args);')
    code('')

#
# update kernel record
#

    code('if (OP_diags>1) deviceSync();')
    comm(' update kernel record')
    code('op_timers_core(&cpu_t2, &wall_t2);')
    code('OP_kernels[' +str(nk)+ '].time     += wall_t2 - wall_t1;')

    if ninds == 0:
      line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *'

      for g_m in range (0,nargs):
        if optflags[g_m]==1:
          IF('<ARG>.opt')
        if maps[g_m]<>OP_GBL:
          if accs[g_m]==OP_READ:
            code(line+' <ARG>.size;')
          else:
            code(line+' <ARG>.size * 2.0f;')
        if optflags[g_m]==1:
          ENDIF()

    depth -= 2
    code('}')

##########################################################################
#  output individual kernel file
##########################################################################
    if not os.path.exists('openmp4'):
        os.makedirs('openmp4')
    fid = open('openmp4/'+name+'_omp4kernel.cpp','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

##############################################################
# generate ****_omp4kernel_func.cpp
##############################################################
    file_text = ''

    if CPP:
      includes = op2_gen_common.extract_includes(kernel_text)
      if len(includes) > 0:
        for include in includes:
          code(include)
        code("")

    code(func_call_signaure_text+params+'){')
    code('')
    depth += 2
    for g_m in range(0, nargs):
      if maps[g_m] == OP_GBL:
        code('<TYP> <ARG>_l = *<ARG>;')
    line = '#pragma omp target teams'
    if op2_compiler == 'clang':
      line +=' distribute parallel for schedule(static,1)\\\n' + (depth+2)*' '
    line +=' num_teams(num_teams) thread_limit(nthread) '
    map_clause = ''
    if maptype == 'map':
      map_clause = 'map(to:'
    elif maptype == 'is_device_ptr':
      map_clause = 'is_device_ptr('
       
    for g_m in range(0,nargs):
      if maps[g_m] == OP_ID:
        if maptype == 'map':
          map_clause += 'data'+str(g_m)+'[0:dat'+str(g_m)+'size],'
        else:
          map_clause += 'data'+str(g_m)+','
    if map_clause != 'is_device_ptr(' and map_clause != 'map(to:':
      map_clause = map_clause[:-1]+')'
      line += map_clause
    # mapping global consts
    if len(kernel_consts) != 0:
      line += ' \\\n' + (depth+2)*' ' + 'map(to:'
      for nc in kernel_consts:
        line += ' ' + consts[nc]['name']+'_ompkernel,'
        if consts[nc]['dim'] != 1:
          if consts[nc]['dim'] > 0:
            num = str(consts[nc]['dim'])
          else:
            num = 'MAX_CONST_SIZE'
          line = line[:-1] + '[:'+ num +'],'
      line = line[:-1]+')'
    # prepare reduction
    reduction_string = ''
    reduction_mapping = ''
    if reduct:
      reduction_mapping ='\\\n'+(depth+2)*' '+ 'map(tofrom:'
      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ:
          if accs[g_m] == OP_INC:
            reduction_string += ' reduction(+:arg%d_l)' % g_m
            reduction_mapping += ' arg%d_l,' % g_m
          if accs[g_m] == OP_MIN:
            reduction_string += ' reduction(min:arg%d_l)' % g_m
            reduction_mapping += ' arg%d_l,' % g_m
          if accs[g_m] == OP_MAX:
            reduction_string += ' reduction(max:arg%d_l)' % g_m
            reduction_mapping += ' arg%d_l,' % g_m
          if accs[g_m] == OP_WRITE:
            reduction_mapping += ' arg%d_l,' % g_m
      reduction_mapping = reduction_mapping[0:-1]+')' 
#
# map extra pointers for indirect version
#
    if ninds>0:
      if maptype == 'map':
        line += '\\\n'+(depth+2)*' '+'map(to:col_reord[0:set_size1],'
      else:
        line += '\\\n'+(depth+2)*' '+'map(to:col_reord,'
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k):
            k = k + [invmapinds[inds[g_m]-1]]
            if maptype == 'map':
              line = line + 'map'+str(mapinds[g_m])+'[0:map'+str(mapinds[g_m])+'size],'
            else:
              line = line + 'map'+str(mapinds[g_m])+','
      for m in range(1,ninds+1):
        g_m = invinds[m-1]
        if maptype == 'map':
          line = line + 'data'+str(g_m)+'[0:dat'+str(g_m)+'size],'
        else:
          line = line + 'data'+str(g_m)+','
      line = line[:-1]+')'
#
# write omp pragma
#
    code(line + reduction_mapping + reduction_string)
    if op2_compiler != 'clang':
      line = '#pragma omp distribute parallel for schedule(static,1)'
      code(line + reduction_string)
#
# start for loop indirect version
#
    if ninds>0:
      FOR('e','start','end')
      code('int n_op = col_reord[e];')
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx;')
      #do non-optional ones
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k) and (not optflags[g_m]):
            k = k + [mapinds[g_m]]
            code('map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\
              '[n_op + set_size1 * '+str(idxs[g_m])+'];')
      #do optional ones
      if nmaps > 0:
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            if optflags[g_m]:
              IF('optflags & 1<<'+str(optidxs[g_m]))
            else:
              k = k + [mapinds[g_m]]
            code('map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\
              '[n_op + set_size1 * '+str(idxs[g_m])+'];')
            if optflags[g_m]:
              ENDIF()

      code('')
      for g_m in range (0,nargs):
        u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
        if len(u) > 0 and vectorised[g_m] > 0:
          if accs[g_m] == OP_READ:
            line = 'const <TYP>* <ARG>_vec[] = {\n'
          else:
            line = '<TYP>* <ARG>_vec[] = {\n'

          v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
          first = [i for i in range(0,len(v)) if v[i] == 1]
          first = first[0]

          indent = ' '*(depth+2)
          for k in range(0,sum(v)):
            if soaflags[g_m]:
              line = line + indent + ' &data'+str(first)+'[map'+str(mapinds[g_m+k])+'idx],\n'
            else:
              line = line + indent + ' &data'+str(first)+'[<DIM> * map'+str(mapinds[g_m+k])+'idx],\n'
          line = line[:-2]+'};'
          code(line)
#
# direct version
#
    else:
      FOR('n_op','0','count')
#
# write inlined kernel function
#
    comm('variable mapping')
    for g_m in range(0,nargs):
      line = kernel_params[g_m] + ' = '
      if maps[g_m] == OP_ID:
        if soaflags[g_m]:
          line += '&data%d[n_op]' % g_m
        else:
          line += '&data'+str(g_m)+'['+str(dims[g_m])+'*n_op]'
      if maps[g_m] == OP_MAP:
        if vectorised[g_m]:
          if g_m+1 in unique_args:
            line += 'arg'+str(g_m)+'_vec'
          else:
            line = ''
        else:
          if soaflags[g_m]:
            line += '&data'+str(invinds[inds[g_m]-1])+'[map'+str(mapinds[g_m])+'idx]'
          else:
            line += '&data'+str(invinds[inds[g_m]-1])+'['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]'
      if maps[g_m] == OP_GBL:
        line += '&arg%d_l' % g_m
      if len(line):
        line += ';'
        code(line)
    
    code('')
    comm('inline function')
    indent = ' ' * (depth-2)
    inline_body_text = ''
    for line in body_text.split('\n'):
      if len(line):
        inline_body_text += indent+line+'\n'
      else:
        inline_body_text += '\n'
    code(inline_body_text)
    comm('end inline func')

    ENDFOR()
    code('')
    # end kernel function
    for g_m in range(0, nargs):
      if maps[g_m] == OP_GBL:
        code('*<ARG> = <ARG>_l;')
    depth -= 2;
    code('}')


##########################################################################
#  output individual omp4kernel file
##########################################################################
    fid = open('openmp4/'+name+'_omp4kernel_func.cpp','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

# end of main kernel call loop


##########################################################################
#  output one master kernel file
##########################################################################

  file_text =''
  comm(' header                 ')
  code('#include "op_lib_cpp.h"       ')
  code('')
  comm(' user kernel files')

  for nk in range(0,len(kernels)):
    code('#include "'+kernels[nk]['name']+'_omp4kernel.cpp"')
  master = master.split('.')[0]
  fid = open('openmp4/'+master.split('.')[0]+'_omp4kernels.cpp','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()


##########################################################################
#  output omp4 master kernel file
##########################################################################

  file_text =''

  comm(' global constants       ')

  for nc in range (0,len(consts)):
    if consts[nc]['dim']==1:
      code(consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_ompkernel;')
    else:
      if consts[nc]['dim'] > 0:
        num = str(consts[nc]['dim'])
      else:
        num = 'MAX_CONST_SIZE'
      code(consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_ompkernel['+num+'];')
  code('')

  comm(' header                 ')

  if os.path.exists('./user_types.h'):
    code('#include "../user_types.h"')
  code('#include "op_lib_cpp.h"       ')
  code('')

  code('void op_decl_const_char(int dim, char const *type,')
  code('  int size, char *dat, char const *name){')
  indent = ' ' * ( 2+ depth)
  line = '  ' 
  for nc in range (0,len(consts)):
    varname = consts[nc]['name']
    if nc > 0:
        line += ' else '
    line += 'if(!strcmp(name, "%s")) {\n' %varname + indent + 2*' ' + 'memcpy('
    if consts[nc]['dim']==1:
      line += '&'
    line += varname+ '_ompkernel, dat, dim*size);\n' + indent + '#pragma omp target enter data map(to:'+varname+'_ompkernel'
    if consts[nc]['dim'] !=1:
      line += '[:%s]' % str(consts[nc]['dim']) if consts[nc]['dim'] > 0 else 'MAX_CONST_SIZE'
    line += ')\n'+indent + '}'
  code(line)
  code('}')

  comm(' user kernel files')

  for nk in range(0,len(kernels)):
    code('#include "'+kernels[nk]['name']+'_omp4kernel_func.cpp"')
  master = master.split('.')[0]
  fid = open('openmp4/'+master.split('.')[0]+'_omp4kernel_funcs.cpp','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()