示例#1
0
def op2_gen_openacc(master, date, consts, kernels):

  global dims, idxs, typs, indtyps, inddims
  global FORTRAN, CPP, g_m, file_text, depth

  OP_ID   = 1;  OP_GBL   = 2;  OP_MAP = 3;

  OP_READ = 1;  OP_WRITE = 2;  OP_RW  = 3;
  OP_INC  = 4;  OP_MAX   = 5;  OP_MIN = 6;

  accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ]

  any_soa = 0
  for nk in range (0,len(kernels)):
    any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

  for nk in range (0,len(kernels)):

    name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
            ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
            unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])

    optidxs = [0]*nargs
    indopts = [-1]*nargs
    nopts = 0
    for i in range(0,nargs):
      if optflags[i] == 1 and maps[i] == OP_ID:
        optidxs[i] = nopts
        nopts = nopts+1
      elif optflags[i] == 1 and maps[i] == OP_MAP:
        if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination
          optidxs[i] = nopts
          indopts[inds[i]-1] = i
          nopts = nopts+1
        else:
          optidxs[i] = optidxs[invinds[inds[i]-1]]

#
# set two logicals
#
    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_INC:
        j = i
    ind_inc = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_GBL and accs[i] <> OP_READ and accs[i] <> OP_WRITE:
        j = i
    reduct = j >= 0

##########################################################################
#  start with the user kernel function
##########################################################################

    FORTRAN = 0;
    CPP     = 1;
    g_m = 0;
    file_text = ''
    depth = 0

    comm('user function')

    #strides for SoA
    if any_soa:
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;')
            code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;')
      dir_soa = -1
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1):
          code('int direct_'+name+'_stride_OP2CONSTANT;')
          code('int direct_'+name+'_stride_OP2HOST=-1;')
          dir_soa = g_m
          break

    comm('user function')
    file_name = decl_filepath

    f = open(file_name, 'r')
    kernel_text = f.read()
    f.close()

    kernel_text = op2_gen_common.comment_remover(kernel_text)
    kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text)

    p = re.compile('void\\s+\\b'+name+'\\b')
    i = p.search(kernel_text).start()

    if(i < 0):
      print "\n********"
      print "Error: cannot locate user kernel function name: "+name+" - Aborting code generation"
      exit(2)
    i2 = i

    #i = kernel_text[0:i].rfind('\n') #reverse find
    j = kernel_text[i:].find('{')
    k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}')
    signature_text = kernel_text[i:i+j]
    l = signature_text[0:].find('(')
    head_text = signature_text[0:l].strip() #save function name
    m = op2_gen_common.para_parse(signature_text, 0, '(', ')')
    signature_text = signature_text[l+1:m]
    body_text = kernel_text[i+j+1:k]

    # check for number of arguments
    if len(signature_text.split(',')) != nargs_novec:
        print 'Error parsing user kernel('+name+'): must have '+str(nargs_novec)+' arguments'
        return

    for i in range(0,nargs_novec):
        var = signature_text.split(',')[i].strip()
        if kernels[nk]['soaflags'][i]:
          var = var.replace('*','')
          #locate var in body and replace by adding [idx]
          length = len(re.compile('\\s+\\b').split(var))
          var2 = re.compile('\\s+\\b').split(var)[length-1].strip()

          if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP:
            body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+ \
                    op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)
          else:
            body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text)
            body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \
                    op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)

    signature_text = '//#pragma acc routine\ninline ' + head_text + '( '+signature_text + ') {'
    file_text += signature_text + body_text + '}\n'

##########################################################################
# then C++ stub function
##########################################################################

    code('')
    comm(' host stub function')
    code('void op_par_loop_'+name+'(char const *name, op_set set,')
    depth += 2

    for m in unique_args:
      g_m = m - 1
      if m == unique_args[len(unique_args)-1]:
        code('op_arg ARG){');
        code('')
      else:
        code('op_arg ARG,')

    for g_m in range (0,nargs):
      if maps[g_m]==OP_GBL: #and accs[g_m] <> OP_READ:
        code('TYP*ARGh = (TYP *)ARG.data;')

    code('int nargs = '+str(nargs)+';')
    code('op_arg args['+str(nargs)+'];')
    code('')

    for g_m in range (0,nargs):
      u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
      if len(u) > 0 and vectorised[g_m] > 0:
        code('ARG.idx = 0;')
        code('args['+str(g_m)+'] = ARG;')

        v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
        first = [i for i in range(0,len(v)) if v[i] == 1]
        first = first[0]
        if (optflags[g_m] == 1):
          argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, '
        else:
          argtyp = 'op_arg_dat('

        FOR('v','1',str(sum(v)))
        code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
        str(first)+'.map, DIM, "TYP", '+accsstring[accs[g_m]-1]+');')
        ENDFOR()
        code('')
      elif vectorised[g_m]>0:
        pass
      else:
        code('args['+str(g_m)+'] = ARG;')

#
# start timing
#
    code('')
    comm(' initialise timers')
    code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
    code('op_timing_realloc('+str(nk)+');')
    code('op_timers_core(&cpu_t1, &wall_t1);')
    code('OP_kernels[' +str(nk)+ '].name      = name;')
    code('OP_kernels[' +str(nk)+ '].count    += 1;')
    code('')

#
#   indirect bits
#
    if ninds>0:
      code('int  ninds   = '+str(ninds)+';')
      line = 'int  inds['+str(nargs)+'] = {'
      for m in range(0,nargs):
        line += str(inds[m]-1)+','
      code(line[:-1]+'};')
      code('')

      IF('OP_diags>2')
      code('printf(" kernel routine with indirection: '+name+'\\n");')
      ENDIF()

      code('')
      comm(' get plan')
      code('#ifdef OP_PART_SIZE_'+ str(nk))
      code('  int part_size = OP_PART_SIZE_'+str(nk)+';')
      code('#else')
      code('  int part_size = OP_part_size;')
      code('#endif')
      code('')
      code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);')

#
# direct bit
#
    else:
      code('')
      IF('OP_diags>2')
      code('printf(" kernel routine w/o indirection:  '+ name + '");')
      ENDIF()
      code('')
      code('op_mpi_halo_exchanges_cuda(set, nargs, args);')

    code('')
    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL: #and accs[g_m]<>OP_READ:
        if not dims[g_m].isdigit() or int(dims[g_m]) > 1:
          print 'ERROR: OpenACC does not support multi-dimensional op_arg_gbl variables'
          exit(-1)
        code('TYP ARG_l = ARGh[0];')

    if ninds > 0:
      code('')
      code('int ncolors = 0;')
    code('')
    IF('set->size >0')
    code('')
    #managing constants
    if any_soa:
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))')
            code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');')
            code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT = opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST;')
            ENDIF()
      if dir_soa<>-1:
          IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))')
          code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');')
          code('direct_'+name+'_stride_OP2CONSTANT = direct_'+name+'_stride_OP2HOST;')
          ENDIF()
    code('')
    comm('Set up typed device pointers for OpenACC')
    if nmaps > 0:
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k):
          k = k + [invmapinds[inds[g_m]-1]]
          code('int *map'+str(mapinds[g_m])+' = arg'+str(invmapinds[inds[g_m]-1])+'.map_data_d;')

    code('')
    for g_m in range(0,nargs):
      if maps[g_m] == OP_ID:
        code(typs[g_m]+'* data'+str(g_m)+' = ('+typs[g_m]+'*)arg'+str(g_m)+'.data_d;')

    for m in range(1,ninds+1):
      g_m = invinds[m-1]
      code('TYP *data'+str(g_m)+' = (TYP *)ARG.data_d;')

#
# kernel call for indirect version
#
    if ninds>0:
      code('')
      code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);')
      code('ncolors = Plan->ncolors;')
      code('int *col_reord = Plan->col_reord;')
      code('int set_size1 = set->size + set->exec_size;')
      code('')
      comm(' execute plan')
      FOR('col','0','Plan->ncolors')
      IF('col==1')
      code('op_mpi_wait_all_cuda(nargs, args);')
      ENDIF()
      code('int start = Plan->col_offsets[0][col];')
      code('int end = Plan->col_offsets[0][col+1];')
      code('')
#      code('#pragma omp parallel for')
      line = '#pragma acc parallel loop independent deviceptr(col_reord,'
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k):
            k = k + [invmapinds[inds[g_m]-1]]
            line = line + 'map'+str(mapinds[g_m])+','

      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          line = line+'data'+str(g_m)+','
      for m in range(1,ninds+1):
        g_m = invinds[m-1]
        line = line + 'data'+str(g_m)+','
      line = line[:-1]+')'

      if reduct:
        for g_m in range(0,nargs):
          if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m] <> OP_WRITE:
            if accs[g_m] == OP_INC:
              line = line + ' reduction(+:arg'+str(g_m)+'_l)'
            if accs[g_m] == OP_MIN:
              line = line + ' reduction(min:arg'+str(g_m)+'_l)'
            if accs[g_m] == OP_MAX:
              line = line + ' reduction(max:arg'+str(g_m)+'_l)'
      code(line)
      FOR('e','start','end')
      code('int n = col_reord[e];')
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\
              '[n + set_size1 * '+str(idxs[g_m])+'];')

      code('')
      for g_m in range (0,nargs):
        u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
        if len(u) > 0 and vectorised[g_m] > 0:
          if accs[g_m] == OP_READ:
            line = 'const TYP* ARG_vec[] = {\n'
          else:
            line = 'TYP* ARG_vec[] = {\n'

          v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
          first = [i for i in range(0,len(v)) if v[i] == 1]
          first = first[0]

          indent = ' '*(depth+2)
          for k in range(0,sum(v)):
            if soaflags[g_m]:
              line = line + indent + ' &data'+str(first)+'[map'+str(mapinds[g_m+k])+'idx],\n'
            else:
              line = line + indent + ' &data'+str(first)+'[DIM * map'+str(mapinds[g_m+k])+'idx],\n'
          line = line[:-2]+'};'
          code(line)
      code('')
      line = name+'('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          if soaflags[g_m]:
            line = line + indent + '&data'+str(g_m)+'[n]'
          else:
            line = line + indent + '&data'+str(g_m)+'['+str(dims[g_m])+' * n]'
        if maps[g_m] == OP_MAP:
          if vectorised[g_m]:
            if g_m+1 in unique_args:
              line = line + indent + 'arg'+str(g_m)+'_vec'
          else:
            if soaflags[g_m]:
              line = line + indent + '&data'+str(invinds[inds[g_m]-1])+'[map'+str(mapinds[g_m])+'idx]'
            else:
              line = line + indent + '&data'+str(invinds[inds[g_m]-1])+'['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]'
        if maps[g_m] == OP_GBL:
          line = line + indent +'&arg'+str(g_m)+'_l'
        if g_m < nargs-1:
          if g_m+1 in unique_args and not g_m+1 == unique_args[-1]:
            line = line +','
        else:
           line = line +');'
      code(line)
      ENDFOR()
      code('')

      if reduct:
        comm(' combine reduction data')
        IF('col == Plan->ncolors_owned-1')
        for g_m in range(0,nargs):
          if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ:
            if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE:
              code('ARGh[0] = ARG_l;')
            elif accs[g_m]==OP_MIN:
              code('ARGh[0]  = MIN(ARGh[0],ARG_l);')
              ENDFOR()
            elif  accs[g_m]==OP_MAX:
              code('ARGh[0]  = MAX(ARGh[0],ARG_l);')
            else:
              error('internal error: invalid reduction option')
            ENDFOR()
        ENDIF()
      ENDFOR()

#
# kernel call for direct version
#
    else:
      line = '#pragma acc parallel loop independent deviceptr('
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          line = line+'data'+str(g_m)+','
      line = line[:-1]+')'

      if reduct:
        for g_m in range(0,nargs):
          if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE:
            if accs[g_m] == OP_INC:
              line = line + ' reduction(+:arg'+str(g_m)+'_l)'
            if accs[g_m] == OP_MIN:
              line = line + ' reduction(min:arg'+str(g_m)+'_l)'
            if accs[g_m] == OP_MAX:
              line = line + ' reduction(max:arg'+str(g_m)+'_l)'
      code(line)
      FOR('n','0','set->size')
      line = name+'('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          if soaflags[g_m]:
            line = line + indent + '&data'+str(g_m)+'[n]'
          else:
            line = line + indent + '&data'+str(g_m)+'['+str(dims[g_m])+'*n]'
        if maps[g_m] == OP_GBL:
          line = line + indent +'&arg'+str(g_m)+'_l'
        if g_m < nargs-1:
          line = line +','
        else:
           line = line +');'
      code(line)
      ENDFOR()

    if ninds>0:
      code('OP_kernels['+str(nk)+'].transfer  += Plan->transfer;')
      code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;')

    ENDIF()
    code('')

    #zero set size issues
    if ninds>0:
      IF('set_size == 0 || set_size == set->core_size || ncolors == 1')
      code('op_mpi_wait_all_cuda(nargs, args);')
      ENDIF()

#
# combine reduction data from multiple OpenMP threads
#
    comm(' combine reduction data')
    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ:
        if ninds==0: #direct version only
          if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE:
            code('ARGh[0] = ARG_l;')
          elif accs[g_m]==OP_MIN:
            code('ARGh[0]  = MIN(ARGh[0],ARG_l);')
          elif accs[g_m]==OP_MAX:
            code('ARGh[0]  = MAX(ARGh[0],ARG_l);')
          else:
            print 'internal error: invalid reduction option'
        if typs[g_m] == 'double': #need for both direct and indirect
          code('op_mpi_reduce_double(&ARG,ARGh);')
        elif typs[g_m] == 'float':
          code('op_mpi_reduce_float(&ARG,ARGh);')
        elif typs[g_m] == 'int':
          code('op_mpi_reduce_int(&ARG,ARGh);')
        else:
          print 'Type '+typs[g_m]+' not supported in OpenACC code generator, please add it'
          exit(-1)


    code('op_mpi_set_dirtybit_cuda(nargs, args);')
    code('')

#
# update kernel record
#

    comm(' update kernel record')
    code('op_timers_core(&cpu_t2, &wall_t2);')
    code('OP_kernels[' +str(nk)+ '].time     += wall_t2 - wall_t1;')

    if ninds == 0:
      line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *'

      for g_m in range (0,nargs):
        if optflags[g_m]==1:
          IF('ARG.opt')
        if maps[g_m]<>OP_GBL:
          if accs[g_m]==OP_READ:
            code(line+' ARG.size;')
          else:
            code(line+' ARG.size * 2.0f;')
        if optflags[g_m]==1:
          ENDIF()
    depth -= 2
    code('}')


##########################################################################
#  output individual kernel file
##########################################################################
    if not os.path.exists('openacc'):
        os.makedirs('openacc')
    fid = open('openacc/'+name+'_acckernel.c','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

# end of main kernel call loop


##########################################################################
#  output one master kernel file
##########################################################################

  file_text =''

  comm(' global constants       ')

  for nc in range (0,len(consts)):
    if consts[nc]['dim']==1:
      code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';')
    else:
      if consts[nc]['dim'] > 0:
        num = str(consts[nc]['dim'])
      else:
        num = 'MAX_CONST_SIZE'

      code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];')
  code('')

  comm(' header                 ')

  if os.path.exists('./user_types.h'):
    code('#include "../user_types.h"')
  code('#include "op_lib_c.h"       ')
  code('')

  code('void op_decl_const_char(int dim, char const *type,')
  code('int size, char *dat, char const *name){}')

  comm(' user kernel files')

  for nk in range(0,len(kernels)):
    code('#include "'+kernels[nk]['name']+'_acckernel.c"')
  master = master.split('.')[0]
  fid = open('openacc/'+master.split('.')[0]+'_acckernels.c','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()
示例#2
0
def op2_gen_mpi_vec(master, date, consts, kernels):

    global dims, idxs, typs, indtyps, inddims
    global FORTRAN, CPP, g_m, file_text, depth

    OP_ID = 1
    OP_GBL = 2
    OP_MAP = 3

    OP_READ = 1
    OP_WRITE = 2
    OP_RW = 3
    OP_INC = 4
    OP_MAX = 5
    OP_MIN = 6

    accsstring = ['OP_READ', 'OP_WRITE', 'OP_RW', 'OP_INC', 'OP_MAX', 'OP_MIN']

    any_soa = 0
    for nk in range(0, len(kernels)):
        any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

    for nk in range(0, len(kernels)):

        name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
                ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
                unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])
        #
        # set three logicals
        #
        j = -1
        for i in range(0, nargs):
            if maps[i] == OP_MAP and accs[i] == OP_INC:
                j = i
        ind_inc = j >= 0

        j = -1
        for i in range(0, nargs):
            if maps[i] == OP_GBL and accs[i] <> OP_READ:
                j = i
        reduct = j >= 0

        j = -1
        for i in range(0, nargs):
            if maps[i] == OP_MAP:
                j = i
        indirect_kernel = j >= 0

        ####################################################################################
        #  generate the user kernel function - creating versions for vectorisation as needed
        ####################################################################################

        FORTRAN = 0
        CPP = 1
        g_m = 0
        file_text = ''
        depth = 0

        #
        # First original version
        #
        comm('user function')
        file_name = decl_filepath

        f = open(file_name, 'r')
        kernel_text = f.read()
        file_text += kernel_text
        f.close()

        #
        # Modified vectorisable version if its an indirect kernel
        # - direct kernels can be vectorised without modification
        #
        if indirect_kernel:
            code('#ifdef VECTORIZE')
            comm('user function -- modified for vectorisation')
            f = open(file_name, 'r')
            kernel_text = f.read()
            f.close()

            kernel_text = op2_gen_common.comment_remover(kernel_text)
            kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text)

            p = re.compile('void\\s+\\b' + name + '\\b')
            i = p.search(kernel_text).start()

            if (i < 0):
                print "\n********"
                print "Error: cannot locate user kernel function name: " + name + " - Aborting code generation"
                exit(2)
            i2 = i

            #i = kernel_text[0:i].rfind('\n') #reverse find
            j = kernel_text[i:].find('{')
            k = op2_gen_common.para_parse(kernel_text, i + j, '{', '}')
            signature_text = kernel_text[i:i + j]
            l = signature_text[0:].find('(')
            head_text = signature_text[0:l]  #save function name
            m = op2_gen_common.para_parse(signature_text, 0, '(', ')')
            signature_text = signature_text[l + 1:m]
            body_text = kernel_text[i + j + 1:k]

            ## Replace occurrences of '#include "<FILE>"' within loop with the contents of <FILE>:
            body_text = op2_gen_common.replace_local_includes_with_file_contents(
                body_text, os.path.dirname(master))

            # check for number of arguments
            nargs_actual = len(signature_text.split(','))
            if nargs_actual != nargs:
                print(
                    'Error parsing user kernel({0}): must have {1} arguments (instead it has {2})'
                    .format(name, nargs, nargs_actual))
                return

            new_signature_text = ''
            for i in range(0, nargs):
                var = signature_text.split(',')[i].strip()

                if maps[i] <> OP_GBL and maps[i] <> OP_ID:
                    #remove * and add [*][SIMD_VEC]
                    var = var.replace('*', '')
                    #locate var in body and replace by adding [idx]
                    length = len(re.compile('\\s+\\b').split(var))
                    var2 = re.compile('\\s+\\b').split(var)[length - 1].strip()

                    #print var2

                    body_text = re.sub('\*\\b' + var2 + '\\b\\s*(?!\[)',
                                       var2 + '[0]', body_text)
                    body_text = re.sub(
                        r'(' + var2 + '\[[\w\(\)\+\-\*\s\\\\]*\]' + ')',
                        r'\1' + '[idx]', body_text)

                    var = var + '[*][SIMD_VEC]'
                    #var = var + '[restrict][SIMD_VEC]'
                new_signature_text += var + ', '

            #add ( , idx and )
            signature_text = "inline " + head_text + '( ' + new_signature_text + 'int idx ) {'
            #finally update name
            signature_text = signature_text.replace(name, name + '_vec')

            #print head_text
            #print signature_text
            #print  body_text

            file_text += signature_text + body_text + '}\n'
            code('#endif')

##########################################################################
# then C++ stub function
##########################################################################

        code('')
        comm(' host stub function')
        code('void op_par_loop_' + name + '(char const *name, op_set set,')
        depth += 2

        for m in unique_args:
            g_m = m - 1
            if m == unique_args[len(unique_args) - 1]:
                code('op_arg <ARG>){')
                code('')
            else:
                code('op_arg <ARG>,')

        code('int nargs = ' + str(nargs) + ';')
        code('op_arg args[' + str(nargs) + '];')
        code('')

        for g_m in range(0, nargs):
            u = [
                i for i in range(0, len(unique_args))
                if unique_args[i] - 1 == g_m
            ]
            if len(u) > 0 and vectorised[g_m] > 0:
                code('<ARG>.idx = 0;')
                code('args[' + str(g_m) + '] = <ARG>;')

                v = [
                    int(vectorised[i] == vectorised[g_m])
                    for i in range(0, len(vectorised))
                ]
                first = [i for i in range(0, len(v)) if v[i] == 1]
                first = first[0]
                if (optflags[g_m] == 1):
                    argtyp = 'op_opt_arg_dat(arg' + str(first) + '.opt, '
                else:
                    argtyp = 'op_arg_dat('

                FOR('v', '1', str(sum(v)))
                code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
                str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');')
                ENDFOR()
                code('')
            elif vectorised[g_m] > 0:
                pass
            else:
                code('args[' + str(g_m) + '] = <ARG>;')

#
# create aligned pointers
#
        comm('create aligned pointers for dats')
        for g_m in range(0, nargs):
            if maps[g_m] <> OP_GBL:
                if (accs[g_m] == OP_INC or accs[g_m] == OP_RW
                        or accs[g_m] == OP_WRITE):
                    code('ALIGNED_<TYP>       <TYP> * __restrict__ ptr'+\
                    str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')
                    #code('<TYP>* __restrict__ __attribute__((align_value (<TYP>_ALIGN)))  ptr'+\
                    #str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')
                    code('__assume_aligned(ptr' + str(g_m) + ',<TYP>_ALIGN);')

                else:
                    code('ALIGNED_<TYP> const <TYP> * __restrict__ ptr'+\
                    str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')
                    code('__assume_aligned(ptr' + str(g_m) + ',<TYP>_ALIGN);')
                    #code('const <TYP>* __restrict__ __attribute__((align_value (<TYP>_ALIGN)))  ptr'+\
                    #str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')

#
# start timing
#
        code('')
        comm(' initialise timers')
        code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
        code('op_timing_realloc(' + str(nk) + ');')
        code('op_timers_core(&cpu_t1, &wall_t1);')
        code('')

        #
        #   indirect bits
        #
        if ninds > 0:
            IF('OP_diags>2')
            code('printf(" kernel routine with indirection: ' + name +
                 '\\n");')
            ENDIF()

#
# direct bit
#
        else:
            code('')
            IF('OP_diags>2')
            code('printf(" kernel routine w/o indirection:  ' + name + '");')
            ENDIF()

        code('')
        code('int exec_size = op_mpi_halo_exchanges(set, nargs, args);')

        code('')
        IF('exec_size >0')
        code('')

        #
        # kernel call for indirect version
        #
        if ninds > 0:
            code('#ifdef VECTORIZE')

            code('#pragma novector')
            FOR2('n', '0', '(exec_size/SIMD_VEC)*SIMD_VEC', 'SIMD_VEC')
            #initialize globals
            for g_m in range(0, nargs):
                if maps[g_m] == OP_GBL:
                    code('<TYP> dat{0}[SIMD_VEC];'.format(g_m))
                    FOR('i', '0', 'SIMD_VEC')
                    if accs[g_m] == OP_INC:
                        code('dat{0}[i] = 0.0;'.format(g_m))
                    elif accs[g_m] == OP_MAX:
                        code('dat{0}[i] = -INFINITY;'.format(g_m))
                    elif accs[g_m] == OP_MIN:
                        code('dat{0}[i] = INFINITY;'.format(g_m))
                    elif accs[g_m] == OP_READ:
                        code('dat{0}[i] = *((<TYP>*)arg{0}.data);'.format(g_m))
                    ENDFOR()

            IF('n+SIMD_VEC >= set->core_size')
            code('op_mpi_wait_all(nargs, args);')
            ENDIF()
            for g_m in range(0, nargs):
                if maps[g_m] == OP_MAP and (accs[g_m] == OP_READ \
                  or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE \
                  or accs[g_m] == OP_INC):
                    code('ALIGNED_<TYP> <TYP> dat' + str(g_m) +
                         '[<DIM>][SIMD_VEC];')

            #setup gathers
            code('#pragma omp simd simdlen(SIMD_VEC)')
            FOR('i', '0', 'SIMD_VEC')
            if nmaps > 0:
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP:
                        if (accs[g_m] == OP_READ or accs[g_m] == OP_RW
                                or accs[g_m]
                                == OP_WRITE):  #and (not mapinds[g_m] in k):
                            code('int idx' + str(g_m) +
                                 '_<DIM> = <DIM> * arg' +
                                 str(invmapinds[inds[g_m] - 1]) +
                                 '.map_data[(n+i) * arg' +
                                 str(invmapinds[inds[g_m] - 1]) +
                                 '.map->dim + ' + str(idxs[g_m]) + '];')
            code('')
            for g_m in range(0, nargs):
                if maps[g_m] == OP_MAP:
                    if (accs[g_m] == OP_READ or accs[g_m]
                            == OP_RW):  #and (not mapinds[g_m] in k):
                        for d in range(0, int(dims[g_m])):
                            code('dat' + str(g_m) + '[' + str(d) +
                                 '][i] = (ptr' + str(g_m) + ')[idx' +
                                 str(g_m) + '_<DIM> + ' + str(d) + '];')
                        code('')
                    elif (accs[g_m] == OP_INC):
                        for d in range(0, int(dims[g_m])):
                            code('dat' + str(g_m) + '[' + str(d) +
                                 '][i] = 0.0;')
                        code('')
                else:  #globals
                    if (accs[g_m] == OP_INC):
                        # for d in range(0,int(dims[g_m])):
                        #   code('dat'+str(g_m)+'[i] = 0.0;')
                        # code('')
                        pass

            ENDFOR()
            #kernel call
            code('#pragma omp simd simdlen(SIMD_VEC)')
            FOR('i', '0', 'SIMD_VEC')
            line = name + '_vec('
            indent = '\n' + ' ' * (depth + 2)
            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID:
                    line = line + indent + '&(ptr' + str(g_m) + ')[' + str(
                        dims[g_m]) + ' * (n+i)],'
                elif maps[g_m] == OP_GBL and accs[g_m] == OP_READ:
                    line = line + indent + '(' + typs[g_m] + '*)arg' + str(
                        g_m) + '.data,'
                elif maps[g_m] == OP_GBL and accs[g_m] == OP_INC:
                    line = line + indent + '&dat' + str(g_m) + '[i],'
                else:
                    line = line + indent + 'dat' + str(g_m) + ','
            line = line + indent + 'i);'
            code(line)
            ENDFOR()
            #do the scatters
            FOR('i', '0', 'SIMD_VEC')
            if nmaps > 0:
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP:
                        if (accs[g_m] == OP_INC or accs[g_m] == OP_RW
                                or accs[g_m]
                                == OP_WRITE):  #and (not mapinds[g_m] in k):
                            code('int idx' + str(g_m) +
                                 '_<DIM> = <DIM> * arg' +
                                 str(invmapinds[inds[g_m] - 1]) +
                                 '.map_data[(n+i) * arg' +
                                 str(invmapinds[inds[g_m] - 1]) +
                                 '.map->dim + ' + str(idxs[g_m]) + '];')
            code('')
            for g_m in range(0, nargs):
                if maps[g_m] == OP_MAP:
                    if (accs[g_m] == OP_INC):
                        for d in range(0, int(dims[g_m])):
                            code('(ptr' + str(g_m) + ')[idx' + str(g_m) +
                                 '_<DIM> + ' + str(d) + '] += dat' + str(g_m) +
                                 '[' + str(d) + '][i];')
                        code('')
                    if (accs[g_m] == OP_WRITE or accs[g_m] == OP_RW):
                        for d in range(0, int(dims[g_m])):
                            code('(ptr' + str(g_m) + ')[idx' + str(g_m) +
                                 '_<DIM> + ' + str(d) + '] = dat' + str(g_m) +
                                 '[' + str(d) + '][i];')
                        code('')
            ENDFOR()

            #do reductions
            for g_m in range(0, nargs):
                if maps[g_m] == OP_GBL:
                    FOR('i', '0', 'SIMD_VEC')
                    if accs[g_m] == OP_INC:
                        code('*(<TYP>*)arg' + str(g_m) + '.data += dat' +
                             str(g_m) + '[i];')
                    elif accs[g_m] == OP_MAX:
                        code('*(<TYP>*)arg' + str(g_m) +
                             '.data = MAX(*(<TYP>*)arg' + str(g_m) +
                             '.data,dat' + str(g_m) + '[i]);')
                    elif accs[g_m] == OP_MIN:
                        code('*(<TYP>*)arg' + str(g_m) +
                             '.data = MIN(*(<TYP>*)arg' + str(g_m) +
                             '.data,dat' + str(g_m) + '[i]);')
                    ENDFOR()

            ENDFOR()
            code('')
            comm('remainder')
            FOR('n', '(exec_size/SIMD_VEC)*SIMD_VEC', 'exec_size')
            depth = depth - 2
            code('#else')
            FOR('n', '0', 'exec_size')
            depth = depth - 2
            code('#endif')
            depth = depth + 2
            IF('n==set->core_size')
            code('op_mpi_wait_all(nargs, args);')
            ENDIF()
            if nmaps > 0:
                k = []
                #print name
                #print maps
                #print mapinds
                for g_m in range(0, nargs):
                    #print g_m
                    if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
                        k = k + [mapinds[g_m]]
                        code('int map' + str(mapinds[g_m]) + 'idx = arg' +
                             str(invmapinds[inds[g_m] - 1]) +
                             '.map_data[n * arg' +
                             str(invmapinds[inds[g_m] - 1]) + '.map->dim + ' +
                             str(idxs[g_m]) + '];')
            code('')
            line = name + '('
            indent = '\n' + ' ' * (depth + 2)
            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID:
                    line = line + indent + '&(ptr' + str(g_m) + ')[' + str(
                        dims[g_m]) + ' * n]'
                if maps[g_m] == OP_MAP:
                    line = line + indent + '&(ptr' + str(g_m) + ')[' + str(
                        dims[g_m]) + ' * map' + str(mapinds[g_m]) + 'idx]'
                if maps[g_m] == OP_GBL:
                    line = line + indent + '(' + typs[g_m] + '*)arg' + str(
                        g_m) + '.data'
                if g_m < nargs - 1:
                    line = line + ','
                else:
                    line = line + ');'
            code(line)
            ENDFOR()

#
# kernel call for direct version
#
        else:
            code('#ifdef VECTORIZE')

            code('#pragma novector')
            FOR2('n', '0', '(exec_size/SIMD_VEC)*SIMD_VEC', 'SIMD_VEC')

            #initialize globals
            for g_m in range(0, nargs):
                if maps[g_m] == OP_GBL:
                    code('<TYP> dat{0}[SIMD_VEC];'.format(g_m))
                    FOR('i', '0', 'SIMD_VEC')
                    if accs[g_m] == OP_INC:
                        code('dat{0}[i] = 0.0;'.format(g_m))
                    elif accs[g_m] == OP_MAX:
                        code('dat{0}[i] = -INFINITY;'.format(g_m))
                    elif accs[g_m] == OP_MIN:
                        code('dat{0}[i] = INFINITY;'.format(g_m))
                    elif accs[g_m] == OP_READ:
                        code('dat{0}[i] = *((<TYP>*)arg{0}.data);'.format(g_m))
                    ENDFOR()

            code('#pragma omp simd simdlen(SIMD_VEC)')
            FOR('i', '0', 'SIMD_VEC')
            line = name + '('
            indent = '\n' + ' ' * (depth + 2)
            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID:
                    line = line + indent + '&(ptr' + str(g_m) + ')[' + str(
                        dims[g_m]) + ' * (n+i)]'
                if maps[g_m] == OP_MAP:
                    line = line + indent + '&(ptr' + str(g_m) + ')[' + str(
                        dims[g_m]) + ' * map' + str(mapinds[g_m]) + 'idx]'
                if maps[g_m] == OP_GBL:
                    line = line + indent + '&dat' + str(g_m) + '[i]'
                if g_m < nargs - 1:
                    line = line + ','
                else:
                    line = line + ');'
            code(line)
            ENDFOR()
            #do reductions
            for g_m in range(0, nargs):
                if maps[g_m] == OP_GBL:
                    FOR('i', '0', 'SIMD_VEC')
                    if accs[g_m] == OP_INC:
                        code('*(<TYP>*)arg' + str(g_m) + '.data += dat' +
                             str(g_m) + '[i];')
                    elif accs[g_m] == OP_MAX:
                        code('*(<TYP>*)arg' + str(g_m) +
                             '.data = MAX(*(<TYP>*)arg' + str(g_m) +
                             '.data,dat' + str(g_m) + '[i]);')
                    elif accs[g_m] == OP_MIN:
                        code('*(<TYP>*)arg' + str(g_m) +
                             '.data = MIN(*(<TYP>*)arg' + str(g_m) +
                             '.data,dat' + str(g_m) + '[i]);')
                    ENDFOR()
            ENDFOR()

            comm('remainder')
            FOR('n', '(exec_size/SIMD_VEC)*SIMD_VEC', 'exec_size')
            depth = depth - 2
            code('#else')
            FOR('n', '0', 'exec_size')
            depth = depth - 2
            code('#endif')
            depth = depth + 2
            line = name + '('
            indent = '\n' + ' ' * (depth + 2)
            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID:
                    line = line + indent + '&(ptr' + str(g_m) + ')[' + str(
                        dims[g_m]) + '*n]'
                if maps[g_m] == OP_GBL:
                    line = line + indent + '(' + typs[g_m] + '*)arg' + str(
                        g_m) + '.data'
                if g_m < nargs - 1:
                    line = line + ','
                else:
                    line = line + ');'
            code(line)
            ENDFOR()
        ENDIF()
        code('')

        #zero set size issues
        if ninds > 0:
            IF('exec_size == 0 || exec_size == set->core_size')
            code('op_mpi_wait_all(nargs, args);')
            ENDIF()

#
# combine reduction data from multiple OpenMP threads
#
        comm(' combine reduction data')
        for g_m in range(0, nargs):
            if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ:
                code('op_mpi_reduce(&<ARG>,(' + typs[g_m] + '*)<ARG>.data);')

        code('op_mpi_set_dirtybit(nargs, args);')
        code('')

        #
        # update kernel record
        #

        comm(' update kernel record')
        code('op_timers_core(&cpu_t2, &wall_t2);')
        code('OP_kernels[' + str(nk) + '].name      = name;')
        code('OP_kernels[' + str(nk) + '].count    += 1;')
        code('OP_kernels[' + str(nk) + '].time     += wall_t2 - wall_t1;')

        if ninds == 0:
            line = 'OP_kernels[' + str(nk) + '].transfer += (float)set->size *'

            for g_m in range(0, nargs):
                if maps[g_m] <> OP_GBL:
                    if accs[g_m] == OP_READ:
                        code(line + ' <ARG>.size;')
                    else:
                        code(line + ' <ARG>.size * 2.0f;')
        else:
            names = []
            for g_m in range(0, ninds):
                mult = ''
                if indaccs[g_m] <> OP_WRITE and indaccs[g_m] <> OP_READ:
                    mult = ' * 2.0f'
                if not var[invinds[g_m]] in names:
                    code('OP_kernels[' + str(nk) +
                         '].transfer += (float)set->size * arg' +
                         str(invinds[g_m]) + '.size' + mult + ';')
                    names = names + [var[invinds[g_m]]]
            for g_m in range(0, nargs):
                mult = ''
                if accs[g_m] <> OP_WRITE and accs[g_m] <> OP_READ:
                    mult = ' * 2.0f'
                if not var[g_m] in names:
                    names = names + [var[invinds[g_m]]]
                    if maps[g_m] == OP_ID:
                        code('OP_kernels[' + str(nk) +
                             '].transfer += (float)set->size * arg' +
                             str(g_m) + '.size' + mult + ';')
                    elif maps[g_m] == OP_GBL:
                        code('OP_kernels[' + str(nk) +
                             '].transfer += (float)set->size * arg' +
                             str(g_m) + '.size' + mult + ';')
            if nmaps > 0:
                k = []
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
                        k = k + [mapnames[g_m]]
                        code('OP_kernels[' + str(nk) +
                             '].transfer += (float)set->size * arg' +
                             str(invinds[inds[g_m] - 1]) + '.map->dim * 4.0f;')

        depth -= 2
        code('}')

        ##########################################################################
        #  output individual kernel file
        ##########################################################################
        if not os.path.exists('vec'):
            os.makedirs('vec')
        fid = open('vec/' + name + '_veckernel.cpp', 'w')
        date = datetime.datetime.now()
        #fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n')
        fid.write('//\n// auto-generated by op2.py\n//\n\n')
        fid.write(file_text)
        fid.close()


# end of main kernel call loop

##########################################################################
#  output one master kernel file
##########################################################################

    file_text = ''

    code('#define double_ALIGN 128')
    code('#define float_ALIGN 64')
    code('#define int_ALIGN 64')
    code('#ifdef VECTORIZE')
    code('#define SIMD_VEC 4')
    code('#define ALIGNED_double __attribute__((aligned(double_ALIGN)))')
    code('#define ALIGNED_float __attribute__((aligned(float_ALIGN)))')
    code('#define ALIGNED_int __attribute__((aligned(int_ALIGN)))')
    code('#else')
    code('#define ALIGNED_double')
    code('#define ALIGNED_float')
    code('#define ALIGNED_int')
    code('#endif')
    code('')

    comm(' global constants       ')

    for nc in range(0, len(consts)):
        if not consts[nc]['user_declared']:
            if consts[nc]['dim'] == 1:
                code('extern ' + consts[nc]['type'][1:-1] + ' ' +
                     consts[nc]['name'] + ';')
            else:
                if consts[nc]['dim'] > 0:
                    num = str(consts[nc]['dim'])
                else:
                    num = 'MAX_CONST_SIZE'
                code('extern ' + consts[nc]['type'][1:-1] + ' ' +
                     consts[nc]['name'] + '[' + num + '];')
    code('')

    comm(' header                 ')

    if os.path.exists('./user_types.h'):
        code('#include "../user_types.h"')
    code('#include "op_lib_cpp.h"')
    code('')

    comm(' user kernel files')

    for nk in range(0, len(kernels)):
        code('#include "' + kernels[nk]['name'] + '_veckernel.cpp"')
    master = master.split('.')[0]
    fid = open('vec/' + master.split('.')[0] + '_veckernels.cpp', 'w')
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()
示例#3
0
def op2_gen_mpi_vec(master, date, consts, kernels):

  global dims, idxs, typs, indtyps, inddims
  global FORTRAN, CPP, g_m, file_text, depth

  OP_ID   = 1;  OP_GBL   = 2;  OP_MAP = 3;

  OP_READ = 1;  OP_WRITE = 2;  OP_RW  = 3;
  OP_INC  = 4;  OP_MAX   = 5;  OP_MIN = 6;

  accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ]

  grouped = 0

  any_soa = 0
  for nk in range (0,len(kernels)):
    any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

  for nk in range (0,len(kernels)):

    name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
            ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
            unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])
#
# set three logicals
#
    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_INC:
        j = i
    ind_inc = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_GBL and accs[i] != OP_READ:
        j = i
    reduct = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP :
        j = i
    indirect_kernel = j >= 0

    if nargs != nargs_novec:
      return
####################################################################################
#  generate the user kernel function - creating versions for vectorisation as needed
####################################################################################

    FORTRAN = 0;
    CPP     = 1;
    g_m = 0;
    file_text = ''
    depth = 0

#
# First original version
#
    comm('user function')
    file_name = decl_filepath

    f = open(file_name, 'r')
    kernel_text = f.read()
    file_text += kernel_text
    f.close()

    ## Clang compiler can struggle to vectorize a loop if it uses a mix of
    ## Python-generated simd arrays for indirect data AND pointers to direct
    ## data. Fix by also generating simd arrays for direct data:
    do_gen_direct_simd_arrays = True

#
# Modified vectorisable version if its an indirect kernel
# - direct kernels can be vectorised without modification
#
    if indirect_kernel:
      code('#ifdef VECTORIZE')
      comm('user function -- modified for vectorisation')
      f = open(file_name, 'r')
      kernel_text = f.read()
      f.close()

      kernel_text = op2_gen_common.comment_remover(kernel_text)
      kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text)

      p = re.compile('void\\s+\\b'+name+'\\b')
      i = p.search(kernel_text).start()

      if(i < 0):
        print("\n********")
        print("Error: cannot locate user kernel function name: "+name+" - Aborting code generation")
        exit(2)
      i2 = i

      #i = kernel_text[0:i].rfind('\n') #reverse find
      j = kernel_text[i:].find('{')
      k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}')
      signature_text = kernel_text[i:i+j]
      l = signature_text[0:].find('(')
      head_text = signature_text[0:l] #save function name
      m = op2_gen_common.para_parse(signature_text, 0, '(', ')')
      signature_text = signature_text[l+1:m]
      body_text = kernel_text[i+j+1:k]

      ## Replace occurrences of '#include "<FILE>"' within loop with the contents of <FILE>:
      body_text = op2_gen_common.replace_local_includes_with_file_contents(body_text, os.path.dirname(master))


      # check for number of arguments
      nargs_actual = len(signature_text.split(','))
      if nargs_actual != nargs:
          print(('Error parsing user kernel({0}): must have {1} arguments (instead it has {2})'.format(name, nargs, nargs_actual)))
          return

      new_signature_text = ''
      for i in range(0,nargs):
        var = signature_text.split(',')[i].strip()

        if do_gen_direct_simd_arrays:
          do_gen_simd_array_arg = maps[i] != OP_GBL
        else:
          do_gen_simd_array_arg = maps[i] != OP_GBL and maps[i] != OP_ID
        if do_gen_simd_array_arg:
          #remove * and add [*][SIMD_VEC]
          var = var.replace('*','')
          #locate var in body and replace by adding [idx]
          length = len(re.compile('\\s+\\b').split(var))
          var2 = re.compile('\\s+\\b').split(var)[length-1].strip()

          #print var2

          body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text)
          array_access_pattern = '\[[\w\(\)\+\-\*\s\\\\]*\]'

          ## It has been observed that vectorisation can fail on loops with increments,
          ## but replacing them with writes succeeds.
          ## For example with Clang on particular loops, vectorisation fails with message:
          ##   "loop not vectorized: loop control flow is not understood by vectorizer"
          ## replacing increments with writes solves this.
          ## Replacement is data-safe due to use of local/intermediate SIMD arrays.
          ## Hopefully the regex is matching all increments.
          ## And for loops that were being vectorised, this change can give a small perf boost.
          if maps[i] == OP_MAP and accs[i] == OP_INC:
            ## Replace 'var' increments with writes:
            body_text = re.sub(r'('+var2+array_access_pattern+'\s*'+')'+re.escape("+="), r'\1'+'=', body_text)

          ## Append vector array access:
          body_text = re.sub(r'('+var2+array_access_pattern+')', r'\1'+'[idx]', body_text)

          var = var + '[][SIMD_VEC]'
          #var = var + '[restrict][SIMD_VEC]'
        new_signature_text +=  var+', '


      #add ( , idx and )
      signature_text = "#if defined __clang__ || defined __GNUC__\n"
      signature_text += "__attribute__((always_inline))\n"
      signature_text += "#endif\n"
      signature_text += "inline " + head_text + '( '+new_signature_text + 'int idx ) {'
      #finally update name
      signature_text = signature_text.replace(name,name+'_vec')

      #print head_text
      #print signature_text
      #print  body_text

      file_text += signature_text + body_text + '}\n'
      code('#endif');



##########################################################################
# then C++ stub function
##########################################################################

    code('')
    comm(' host stub function')
    code('void op_par_loop_'+name+'(char const *name, op_set set,')
    depth += 2

    for m in unique_args:
      g_m = m - 1
      if m == unique_args[len(unique_args)-1]:
        code('op_arg <ARG>){');
        code('')
      else:
        code('op_arg <ARG>,')

    code('int nargs = '+str(nargs)+';')
    code('op_arg args['+str(nargs)+'];')
    code('')

    for g_m in range (0,nargs):
      u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
      if len(u) > 0 and vectorised[g_m] > 0:
        code('<ARG>.idx = 0;')
        code('args['+str(g_m)+'] = <ARG>;')

        v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
        first = [i for i in range(0,len(v)) if v[i] == 1]
        first = first[0]
        if (optflags[g_m] == 1):
          argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, '
        else:
          argtyp = 'op_arg_dat('

        FOR('v','1',str(sum(v)))
        code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
        str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');')
        ENDFOR()
        code('')
      elif vectorised[g_m]>0:
        pass
      else:
        code('args['+str(g_m)+'] = <ARG>;')

#
# create aligned pointers
#
    comm('create aligned pointers for dats')
    for g_m in range (0,nargs):
        if maps[g_m] != OP_GBL:
          if (accs[g_m] == OP_INC or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE):
            code('ALIGNED_<TYP>       <TYP> * __restrict__ ptr'+\
            str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')
            #code('<TYP>* __restrict__ __attribute__((align_value (<TYP>_ALIGN)))  ptr'+\
            #str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')
            code('DECLARE_PTR_ALIGNED(ptr'+str(g_m)+',<TYP>_ALIGN);')

          else:
            code('ALIGNED_<TYP> const <TYP> * __restrict__ ptr'+\
            str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')
            code('DECLARE_PTR_ALIGNED(ptr'+str(g_m)+',<TYP>_ALIGN);')
            #code('const <TYP>* __restrict__ __attribute__((align_value (<TYP>_ALIGN)))  ptr'+\
            #str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;')



#
# start timing
#
    code('')
    comm(' initialise timers')
    code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
    code('op_timing_realloc('+str(nk)+');')
    code('op_timers_core(&cpu_t1, &wall_t1);')
    code('')

#
#   indirect bits
#
    if ninds>0:
      IF('OP_diags>2')
      code('printf(" kernel routine with indirection: '+name+'\\n");')
      ENDIF()

#
# direct bit
#
    else:
      code('')
      IF('OP_diags>2')
      code('printf(" kernel routine w/o indirection:  '+ name + '");')
      ENDIF()

    code('')
    if grouped:
      code('int exec_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 1);')
    else:
      code('int exec_size = op_mpi_halo_exchanges(set, nargs, args);')

    code('')
    IF('exec_size >0')
    code('')

#
# kernel call for indirect version
#
    if ninds>0:
      code('#ifdef VECTORIZE')

      code('#pragma novector')
      FOR2('n','0','(exec_size/SIMD_VEC)*SIMD_VEC','SIMD_VEC')
      #initialize globals
      for g_m in range(0,nargs):
        if maps[g_m] == OP_GBL:
          code('<TYP> dat{0}[SIMD_VEC];'.format(g_m))
          FOR('i','0','SIMD_VEC')
          if accs[g_m] == OP_INC:
            code('dat{0}[i] = 0.0;'.format(g_m))
          elif accs[g_m] == OP_MAX:
            code('dat{0}[i] = -INFINITY;'.format(g_m))
          elif accs[g_m] == OP_MIN:
            code('dat{0}[i] = INFINITY;'.format(g_m))
          elif accs[g_m] == OP_READ:
            code('dat{0}[i] = *((<TYP>*)arg{0}.data);'.format(g_m))
          ENDFOR()

      code('if (n<set->core_size && n>0 && n % OP_mpi_test_frequency == 0)')
      code('  op_mpi_test_all(nargs,args);')
      IF('(n+SIMD_VEC >= set->core_size) && (n+SIMD_VEC-set->core_size < SIMD_VEC)')
      if grouped:
        code('op_mpi_wait_all_grouped(nargs, args, 1);')
      else:
        code('op_mpi_wait_all(nargs, args);')
      ENDIF()
      for g_m in range(0,nargs):
        if do_gen_direct_simd_arrays:
          if (maps[g_m] in [OP_MAP, OP_ID]) and (accs[g_m] in [OP_READ, OP_RW, OP_WRITE, OP_INC]):
            code('ALIGNED_<TYP> <TYP> dat'+str(g_m)+'[<DIM>][SIMD_VEC];')
        else:
          if maps[g_m] == OP_MAP and (accs[g_m] in [OP_READ, OP_RW, OP_WRITE, OP_INC]):
            code('ALIGNED_<TYP> <TYP> dat'+str(g_m)+'[<DIM>][SIMD_VEC];')

      #setup gathers
      idx_map_template = "int idx{0}_<DIM> = <DIM> * arg{1}.map_data[(n+i) * arg{1}.map->dim + {2}];"
      idx_id_template  = "int idx{0}_<DIM> = <DIM> * (n+i);"
      code('#pragma omp simd simdlen(SIMD_VEC)')
      FOR('i','0','SIMD_VEC')
      if nmaps > 0:
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP :
            if (accs[g_m] in [OP_READ, OP_RW, OP_WRITE]):#and (not mapinds[g_m] in k):
              code(idx_map_template.format(g_m, invmapinds[inds[g_m]-1], idxs[g_m]))
          elif do_gen_direct_simd_arrays and maps[g_m] == OP_ID :
            code(idx_id_template.format(g_m))
      code('')

      init_dat_template = "dat{0}[{1}][i] = (ptr{0})[idx{0}_<DIM> + {1}];"
      zero_dat_template = "dat{0}[{1}][i] = 0.0;"
      for g_m in range(0,nargs):
        if do_gen_direct_simd_arrays:
          ## also 'gather' directly-accessed data, because SOME compilers
          ## struggle to vectorise otherwise (e.g. Clang).
          if maps[g_m] != OP_GBL :
            if accs[g_m] in [OP_READ, OP_RW]:
              for d in range(0,int(dims[g_m])):
                code(init_dat_template.format(g_m, d))
              code('')
            elif accs[g_m] == OP_INC:
              for d in range(0,int(dims[g_m])):
                code(zero_dat_template.format(g_m, d))
              code('')
        else:
          if maps[g_m] == OP_MAP :
            if accs[g_m] in [OP_READ, OP_RW]:#and (not mapinds[g_m] in k):
              for d in range(0,int(dims[g_m])):
                init_dat_str = init_dat_template.format(g_m, d)
                code(init_dat_str)
              code('')
            elif (accs[g_m] == OP_INC):
              for d in range(0,int(dims[g_m])):
                zero_dat_str = zero_dat_template.format(g_m, d)
                code(zero_dat_str)
              code('')
          else: #globals
            if (accs[g_m] == OP_INC):
              # for d in range(0,int(dims[g_m])):
              #   code('dat'+str(g_m)+'[i] = 0.0;')
              # code('')
              pass

      ENDFOR()
      #kernel call
      code('#pragma omp simd simdlen(SIMD_VEC)')
      FOR('i','0','SIMD_VEC')
      line = name+'_vec('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if (not do_gen_direct_simd_arrays) and maps[g_m] == OP_ID:
          line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * (n+i)],'
        elif maps[g_m] == OP_GBL and accs[g_m] == OP_READ:
          line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data,'
        elif maps[g_m] == OP_GBL and accs[g_m] == OP_INC:
          line = line + indent +'&dat'+str(g_m)+'[i],'
        else:
          line = line + indent + 'dat'+str(g_m)+','
      line = line +indent +'i);'
      code(line)
      ENDFOR()
      #do the scatters
      FOR('i','0','SIMD_VEC')
      if nmaps > 0:
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP :
            if (accs[g_m] in [OP_INC, OP_RW, OP_WRITE]):#and (not mapinds[g_m] in k):
              code(idx_map_template.format(g_m, invmapinds[inds[g_m]-1], idxs[g_m]))
          elif do_gen_direct_simd_arrays and maps[g_m] == OP_ID :
            if (accs[g_m] in [OP_INC, OP_RW, OP_WRITE]):
              code(idx_id_template.format(g_m))
      code('')
      dat_scatter_inc_template = "(ptr{0})[idx{0}_<DIM> + {1}] += dat{0}[{1}][i];"
      dat_scatter_wr_template  = "(ptr{0})[idx{0}_<DIM> + {1}] = dat{0}[{1}][i];"
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP :
          if (accs[g_m] == OP_INC ):
            for d in range(0,int(dims[g_m])):
              code(dat_scatter_inc_template.format(g_m, d))
            code('')
          elif accs[g_m] in [OP_WRITE, OP_RW]:
            for d in range(0,int(dims[g_m])):
              code(dat_scatter_wr_template.format(g_m, d))
            code('')
        elif do_gen_direct_simd_arrays and maps[g_m] == OP_ID:
          ## also scatter directly-written data
          if (accs[g_m] == OP_INC ):
            for d in range(0,int(dims[g_m])):
              code(dat_scatter_inc_template.format(g_m, d))
          elif accs[g_m] in [OP_WRITE, OP_RW]:
            for d in range(0,int(dims[g_m])):
              code(dat_scatter_wr_template.format(g_m, d))
            code('')
      ENDFOR()

      #do reductions
      for g_m in range(0,nargs):
        if maps[g_m] == OP_GBL:
          FOR('i','0','SIMD_VEC')
          if accs[g_m] == OP_INC:
            code('*(<TYP>*)arg'+str(g_m)+'.data += dat'+str(g_m)+'[i];')
          elif accs[g_m] == OP_MAX:
            code('*(<TYP>*)arg'+str(g_m)+'.data = MAX(*(<TYP>*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);')
          elif accs[g_m] == OP_MIN:
            code('*(<TYP>*)arg'+str(g_m)+'.data = MIN(*(<TYP>*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);')
          ENDFOR()


      ENDFOR()
      code('')
      comm('remainder')
      FOR('n','(exec_size/SIMD_VEC)*SIMD_VEC','exec_size')
      depth = depth -2
      code('#else')
      FOR('n','0','exec_size')
      depth = depth -2
      code('#endif')
      depth = depth +2
      IF('n==set->core_size')
      if grouped:
        code('op_mpi_wait_all_grouped(nargs, args, 1);')
      else:
        code('op_mpi_wait_all(nargs, args);')
      ENDIF()
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx;')
      #do non-optional ones
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k) and (not optflags[g_m]):
            k = k + [mapinds[g_m]]
            code('map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];')
      #do optional ones
      if nmaps > 0:
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            if optflags[g_m]:
              IF('<ARG>.opt')
            else:
              k = k + [mapinds[g_m]]
            code('map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];')
            if optflags[g_m]:
              ENDIF()

      code('')
      line = name+'('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * n]'
        if maps[g_m] == OP_MAP:
          line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]'
        if maps[g_m] == OP_GBL:
          line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data'
        if g_m < nargs-1:
          line = line +','
        else:
           line = line +');'
      code(line)
      ENDFOR()

#
# kernel call for direct version
#
    else:
      code('#ifdef VECTORIZE')

      code('#pragma novector')
      FOR2('n','0','(exec_size/SIMD_VEC)*SIMD_VEC','SIMD_VEC')

	  #initialize globals
      for g_m in range(0,nargs):
        if maps[g_m] == OP_GBL:
          code('<TYP> dat{0}[SIMD_VEC];'.format(g_m))
          FOR('i','0','SIMD_VEC')
          if accs[g_m] == OP_INC:
            code('dat{0}[i] = 0.0;'.format(g_m))
          elif accs[g_m] == OP_MAX:
            code('dat{0}[i] = -INFINITY;'.format(g_m))
          elif accs[g_m] == OP_MIN:
            code('dat{0}[i] = INFINITY;'.format(g_m))
          elif accs[g_m] == OP_READ:
            code('dat{0}[i] = *((<TYP>*)arg{0}.data);'.format(g_m))
          ENDFOR()

      code('#pragma omp simd simdlen(SIMD_VEC)')
      FOR('i','0','SIMD_VEC')
      line = name+'('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * (n+i)]'
        if maps[g_m] == OP_MAP:
          line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]'
        if maps[g_m] == OP_GBL:
          line = line + indent +'&dat'+str(g_m)+'[i]'
        if g_m < nargs-1:
          line = line +','
        else:
           line = line +');'
      code(line)
      ENDFOR()
      #do reductions
      for g_m in range(0,nargs):
        if maps[g_m] == OP_GBL:
          FOR('i','0','SIMD_VEC')
          if accs[g_m] == OP_INC:
            code('*(<TYP>*)arg'+str(g_m)+'.data += dat'+str(g_m)+'[i];')
          elif accs[g_m] == OP_MAX:
            code('*(<TYP>*)arg'+str(g_m)+'.data = MAX(*(<TYP>*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);')
          elif accs[g_m] == OP_MIN:
            code('*(<TYP>*)arg'+str(g_m)+'.data = MIN(*(<TYP>*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);')
          ENDFOR()
      ENDFOR()

      comm('remainder')
      FOR ('n','(exec_size/SIMD_VEC)*SIMD_VEC','exec_size')
      depth = depth -2
      code('#else')
      FOR('n','0','exec_size')
      depth = depth -2
      code('#endif')
      depth = depth +2
      line = name+'('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+'*n]'
        if maps[g_m] == OP_GBL:
          line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data'
        if g_m < nargs-1:
          line = line +','
        else:
           line = line +');'
      code(line)
      ENDFOR()
    ENDIF()
    code('')

    #zero set size issues
    if ninds>0:
      IF('exec_size == 0 || exec_size == set->core_size')
      if grouped:
        code('op_mpi_wait_all_grouped(nargs, args, 1);')
      else:
        code('op_mpi_wait_all(nargs, args);')
      ENDIF()

#
# combine reduction data from multiple OpenMP threads
#
    comm(' combine reduction data')
    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ:
        code('op_mpi_reduce(&<ARG>,('+typs[g_m]+'*)<ARG>.data);')

    code('op_mpi_set_dirtybit(nargs, args);')
    code('')

#
# update kernel record
#

    comm(' update kernel record')
    code('op_timers_core(&cpu_t2, &wall_t2);')
    code('OP_kernels[' +str(nk)+ '].name      = name;')
    code('OP_kernels[' +str(nk)+ '].count    += 1;')
    code('OP_kernels[' +str(nk)+ '].time     += wall_t2 - wall_t1;')

    if ninds == 0:
      line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *'

      for g_m in range (0,nargs):
        if maps[g_m]!=OP_GBL:
          if accs[g_m]==OP_READ:
            code(line+' <ARG>.size;')
          else:
            code(line+' <ARG>.size * 2.0f;')
    else:
      names = []
      for g_m in range(0,ninds):
        mult=''
        if indaccs[g_m] != OP_WRITE and indaccs[g_m] != OP_READ:
          mult = ' * 2.0f'
        if not var[invinds[g_m]] in names:
          code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(invinds[g_m])+'.size'+mult+';')
          names = names + [var[invinds[g_m]]]
      for g_m in range(0,nargs):
        mult=''
        if accs[g_m] != OP_WRITE and accs[g_m] != OP_READ:
          mult = ' * 2.0f'
        if not var[g_m] in names:
          names = names + [var[invinds[g_m]]]
          if maps[g_m] == OP_ID:
            code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(g_m)+'.size'+mult+';')
          elif maps[g_m] == OP_GBL:
            code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(g_m)+'.size'+mult+';')
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(invinds[inds[g_m]-1])+'.map->dim * 4.0f;')

    depth -= 2
    code('}')


##########################################################################
#  output individual kernel file
##########################################################################
    if not os.path.exists('vec'):
        os.makedirs('vec')
    fid = open('vec/'+name+'_veckernel.cpp','w')
    date = datetime.datetime.now()
    #fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n')
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

# end of main kernel call loop


##########################################################################
#  output one master kernel file
##########################################################################

  file_text =''

  code('#define double_ALIGN 128')
  code('#define float_ALIGN 64')
  code('#define int_ALIGN 64')
  code('#ifdef VECTORIZE')
  code('#define SIMD_VEC 4')
  code('#define ALIGNED_double __attribute__((aligned(double_ALIGN)))')
  code('#define ALIGNED_float __attribute__((aligned(float_ALIGN)))')
  code('#define ALIGNED_int __attribute__((aligned(int_ALIGN)))')
  code('  #ifdef __ICC')
  code('    #define DECLARE_PTR_ALIGNED(X, Y) __assume_aligned(X, Y)')
  code('  #else')
  code('    #define DECLARE_PTR_ALIGNED(X, Y)')
  code('  #endif')
  code('#else')
  code('#define ALIGNED_double')
  code('#define ALIGNED_float')
  code('#define ALIGNED_int')
  code('#define DECLARE_PTR_ALIGNED(X, Y)')
  code('#endif')
  code('')

  comm(' global constants       ')

  for nc in range (0,len(consts)):
    if not consts[nc]['user_declared']:
      if consts[nc]['dim']==1:
        code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';')
      else:
        if consts[nc]['dim'].isdigit() and int(consts[nc]['dim']) > 0:
          num = str(consts[nc]['dim'])
        else:
          num = 'MAX_CONST_SIZE'
        code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];')
  code('')

  comm(' header                 ')

  if os.path.exists('./user_types.h'):
    code('#include "../user_types.h"')
  code('#include "op_lib_cpp.h"')
  code('')

  comm(' user kernel files')

  for nk in range(0,len(kernels)):
    code('#include "'+kernels[nk]['name']+'_veckernel.cpp"')
  master = master.split('.')[0]
  fid = open('vec/'+master.split('.')[0]+'_veckernels.cpp','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()
def op2_gen_openmp_simple(master, date, consts, kernels):

  global dims, idxs, typs, indtyps, inddims
  global FORTRAN, CPP, g_m, file_text, depth

  OP_ID   = 1;  OP_GBL   = 2;  OP_MAP = 3;

  OP_READ = 1;  OP_WRITE = 2;  OP_RW  = 3;
  OP_INC  = 4;  OP_MAX   = 5;  OP_MIN = 6;

  accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ]

  any_soa = 0
  for nk in range (0,len(kernels)):
    any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

  for nk in range (0,len(kernels)):
    name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
            ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
            unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])

    optidxs = [0]*nargs
    indopts = [-1]*nargs
    nopts = 0
    for i in range(0,nargs):
      if optflags[i] == 1 and maps[i] == OP_ID:
        optidxs[i] = nopts
        nopts = nopts+1
      elif optflags[i] == 1 and maps[i] == OP_MAP:
        if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination
          optidxs[i] = nopts
          indopts[inds[i]-1] = i
          nopts = nopts+1
        else:
          optidxs[i] = optidxs[invinds[inds[i]-1]]

#
# set two logicals
#
    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_INC:
        j = i
    ind_inc = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_GBL and accs[i] <> OP_READ and accs[i] <> OP_WRITE:
        j = i
    reduct = j >= 0

##########################################################################
#  start with the user kernel function
##########################################################################

    FORTRAN = 0;
    CPP     = 1;
    g_m = 0;
    file_text = ''
    depth = 0

    comm('user function')

    if FORTRAN:
      code('include '+name+'.inc')
    elif CPP:
      code('#include "../'+decl_filepath+'"')

##########################################################################
# then C++ stub function
##########################################################################

    code('')
    comm(' host stub function')
    code('void op_par_loop_'+name+'(char const *name, op_set set,')
    depth += 2

    for m in unique_args:
      g_m = m - 1
      if m == unique_args[len(unique_args)-1]:
        code('op_arg <ARG>){');
        code('')
      else:
        code('op_arg <ARG>,')

    for g_m in range (0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m] <> OP_READ:
        code('<TYP>*<ARG>h = (<TYP> *)<ARG>.data;')

    code('int nargs = '+str(nargs)+';')
    code('op_arg args['+str(nargs)+'];')
    code('')

    for g_m in range (0,nargs):
      u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
      if len(u) > 0 and vectorised[g_m] > 0:
        code('<ARG>.idx = 0;')
        code('args['+str(g_m)+'] = <ARG>;')

        v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
        first = [i for i in range(0,len(v)) if v[i] == 1]
        first = first[0]
        if (optflags[g_m] == 1):
          argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, '
        else:
          argtyp = 'op_arg_dat('

        FOR('v','1',str(sum(v)))
        code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
        str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');')
        ENDFOR()
        code('')
      elif vectorised[g_m]>0:
        pass
      else:
        code('args['+str(g_m)+'] = <ARG>;')

#
# start timing
#
    code('')
    comm(' initialise timers')
    code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
    if insert_thread_timers:
      code("op_timing_realloc_manytime({0}, {1});".format(str(nk), "omp_get_max_threads()"))
    else:
      code('op_timing_realloc('+str(nk)+');')
    code('op_timers_core(&cpu_t1, &wall_t1);')
    if insert_thread_timers:
      code('double non_thread_walltime = 0.0;')
    code('')

#
#   indirect bits
#
    if ninds>0:
      code('int  ninds   = '+str(ninds)+';')
      line = 'int  inds['+str(nargs)+'] = {'
      for m in range(0,nargs):
        line += str(inds[m]-1)+','
      code(line[:-1]+'};')
      code('')

      IF('OP_diags>2')
      code('printf(" kernel routine with indirection: '+name+'\\n");')
      ENDIF()

      code('')
      comm(' get plan')
      code('#ifdef OP_PART_SIZE_'+ str(nk))
      code('  int part_size = OP_PART_SIZE_'+str(nk)+';')
      code('#else')
      code('  int part_size = OP_part_size;')
      code('#endif')
      code('')
      code('int set_size = op_mpi_halo_exchanges(set, nargs, args);')

#
# direct bit
#
    else:
      code('')
      IF('OP_diags>2')
      code('printf(" kernel routine w/o indirection:  '+ name + '");')
      ENDIF()
      code('')
      code('op_mpi_halo_exchanges(set, nargs, args);')

#
# set number of threads in x86 execution and create arrays for reduction
#

    if reduct or ninds==0:
      comm(' set number of threads')
      code('#ifdef _OPENMP')
      code('  int nthreads = omp_get_max_threads();')
      code('#else')
      code('  int nthreads = 1;')
      code('#endif')

    if reduct:
      code('')
      comm(' allocate and initialise arrays for global reduction')
      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m] <> OP_WRITE:
          code('<TYP> <ARG>_l[nthreads*64];')
          FOR('thr','0','nthreads')
          if accs[g_m]==OP_INC:
            FOR('d','0','<DIM>')
            code('<ARG>_l[d+thr*64]=ZERO_<TYP>;')
            ENDFOR()
          else:
            FOR('d','0','<DIM>')
            code('<ARG>_l[d+thr*64]=<ARG>h[d];')
            ENDFOR()
          ENDFOR()

    code('')
    IF('set->size >0')
    code('')

#
# kernel call for indirect version
#
    if ninds>0:
      code('op_plan *Plan = op_plan_get_stage_upload(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_ALL,0);')
      code('')
      comm(' execute plan')
      code('int block_offset = 0;')
      FOR('col','0','Plan->ncolors')
      IF('col==Plan->ncolors_core')
      code('op_mpi_wait_all(nargs, args);')
      ENDIF()
      code('int nblocks = Plan->ncolblk[col];')
      code('')
      if insert_thread_timers:
        # Pause process timing and switch to per-thread timing:
        code('// Pause process timing and switch to per-thread timing:')
        code('op_timers_core(&cpu_t2, &wall_t2);')
        code('non_thread_walltime += wall_t2 - wall_t1;')

        code('#pragma omp parallel')
        code('{')
        depth += 2
        code('double thr_wall_t1, thr_wall_t2, thr_cpu_t1, thr_cpu_t2;')
        code('op_timers_core(&thr_cpu_t1, &thr_wall_t1);')
        code('')
        code('int nthreads = omp_get_num_threads();')
        code('int thr = omp_get_thread_num();')
        code('int thr_start = (nblocks * thr) / nthreads;')
        code('int thr_end = (nblocks * (thr+1)) / nthreads;')
        code('if (thr_end > nblocks) thr_end = nblocks;')
        FOR('blockIdx','thr_start','thr_end')
      else:
        code('#pragma omp parallel for')
        FOR('blockIdx','0','nblocks')

      code('int blockId  = Plan->blkmap[blockIdx + block_offset];')
      code('int nelem    = Plan->nelems[blockId];')
      code('int offset_b = Plan->offset[blockId];')
      FOR('n','offset_b','offset_b+nelem')
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+\
              '.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];')
      code('')
      for g_m in range (0,nargs):
        u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
        if len(u) > 0 and vectorised[g_m] > 0:
          if accs[g_m] == OP_READ:
            line = 'const <TYP>* <ARG>_vec[] = {\n'
          else:
            line = '<TYP>* <ARG>_vec[] = {\n'

          v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
          first = [i for i in range(0,len(v)) if v[i] == 1]
          first = first[0]

          indent = ' '*(depth+2)
          for k in range(0,sum(v)):
            line = line + indent + ' &((<TYP>*)arg'+str(first)+'.data)[<DIM> * map'+str(mapinds[g_m+k])+'idx],\n'
          line = line[:-2]+'};'
          code(line)
      code('')
      line = name+'('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          line = line + indent + '&(('+typs[g_m]+'*)arg'+str(g_m)+'.data)['+str(dims[g_m])+' * n]'
        if maps[g_m] == OP_MAP:
          if vectorised[g_m]:
            if g_m+1 in unique_args:
                line = line + indent + 'arg'+str(g_m)+'_vec'
          else:
            line = line + indent + '&(('+typs[g_m]+'*)arg'+str(invinds[inds[g_m]-1])+'.data)['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]'
        if maps[g_m] == OP_GBL:
          if accs[g_m] <> OP_READ and accs[g_m] <> OP_WRITE:
            line = line + indent +'&arg'+str(g_m)+'_l[64*omp_get_thread_num()]'
          else:
            line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data'
        if g_m < nargs-1:
          if g_m+1 in unique_args and not g_m+1 == unique_args[-1]:
            line = line +','
        else:
           line = line +');'
      code(line)
      ENDFOR()
      if insert_thread_timers:
        depth -= 2
        code('}')
        code('')
        code('op_timers_core(&thr_cpu_t2, &thr_wall_t2);')
        code('OP_kernels[' +str(nk)+ '].times[thr]  += thr_wall_t2 - thr_wall_t1;')
      ENDFOR()
      code('')

      if reduct:
        comm(' combine reduction data')
        IF('col == Plan->ncolors_owned-1')
        for m in range(0,nargs):
          if maps[m] == OP_GBL and accs[m] <> OP_READ:
            FOR('thr','0','nthreads')
            if accs[m]==OP_INC:
              FOR('d','0','<DIM>')
              code('<ARG>h[d] += <ARG>_l[d+thr*64];')
              ENDFOR()
            elif accs[m]==OP_MIN:
              FOR('d','0','<DIM>')
              code('<ARG>h[d]  = MIN(<ARG>h[d],<ARG>_l[d+thr*64]);')
              ENDFOR()
            elif  accs[m]==OP_MAX:
              FOR('d','0','<DIM>')
              code('<ARG>h[d]  = MAX(<ARG>h[d],<ARG>_l[d+thr*64]);')
              ENDFOR()
            else:
              error('internal error: invalid reduction option')
            ENDFOR()
        ENDIF()

      if insert_thread_timers:
        code('// Revert to process-level timing:')
        code('op_timers_core(&cpu_t1, &wall_t1);')
        code('')
      code('block_offset += nblocks;');
      ENDIF()

#
# kernel call for direct version
#
    else:
      comm(' execute plan')
      if insert_thread_timers:
        # Pause process timing, and switch to per-thread timing:
        code('// Pause process timing, and switch to per-thread timing:')
        code('op_timers_core(&cpu_t2, &wall_t2);')
        code('non_thread_walltime += wall_t2 - wall_t1;')
      code('#pragma omp parallel for')
      FOR('thr','0','nthreads')
      if insert_thread_timers:
        code('double thr_wall_t1, thr_wall_t2, thr_cpu_t1, thr_cpu_t2;')
        code('op_timers_core(&thr_cpu_t1, &thr_wall_t1);')
      code('int start  = (set->size* thr)/nthreads;')
      code('int finish = (set->size*(thr+1))/nthreads;')
      FOR('n','start','finish')
      line = name+'('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          line = line + indent + '&(('+typs[g_m]+'*)arg'+str(g_m)+'.data)['+str(dims[g_m])+'*n]'
        if maps[g_m] == OP_GBL:
          if accs[g_m] <> OP_READ and accs[g_m] <> OP_WRITE:
            line = line + indent +'&arg'+str(g_m)+'_l[64*omp_get_thread_num()]'
          else:
            line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data'
        if g_m < nargs-1:
          line = line +','
        else:
           line = line +');'
      code(line)
      ENDFOR()
      if insert_thread_timers:
        code('op_timers_core(&thr_cpu_t2, &thr_wall_t2);')
        code('OP_kernels['+str(nk)+'].times[thr]  += thr_wall_t2 - thr_wall_t1;')
      ENDFOR()
      if insert_thread_timers:
        # OpenMP block complete, so switch back to process timing:
        code('// OpenMP block complete, so switch back to process timing:')
        code('op_timers_core(&cpu_t1, &wall_t1);')

    if ninds>0:
      code('OP_kernels['+str(nk)+'].transfer  += Plan->transfer;')
      code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;')

    ENDIF()
    code('')

    #zero set size issues
    if ninds>0:
      IF('set_size == 0 || set_size == set->core_size')
      code('op_mpi_wait_all(nargs, args);')
      ENDIF()

#
# combine reduction data from multiple OpenMP threads, direct version
#
    comm(' combine reduction data')
    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m] <> OP_WRITE and ninds==0:
        FOR('thr','0','nthreads')
        if accs[g_m]==OP_INC:
          FOR('d','0','<DIM>')
          code('<ARG>h[d] += <ARG>_l[d+thr*64];')
          ENDFOR()
        elif accs[g_m]==OP_MIN:
          FOR('d','0','<DIM>')
          code('<ARG>h[d]  = MIN(<ARG>h[d],<ARG>_l[d+thr*64]);')
          ENDFOR()
        elif accs[g_m]==OP_MAX:
          FOR('d','0','<DIM>')
          code('<ARG>h[d]  = MAX(<ARG>h[d],<ARG>_l[d+thr*64]);')
          ENDFOR()
        else:
          print 'internal error: invalid reduction option'
        ENDFOR()
      if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ:
        code('op_mpi_reduce(&<ARG>,<ARG>h);')

    code('op_mpi_set_dirtybit(nargs, args);')
    code('')

#
# update kernel record
#

    comm(' update kernel record')
    code('op_timers_core(&cpu_t2, &wall_t2);')
    if insert_thread_timers:
      code('non_thread_walltime += wall_t2 - wall_t1;')
    code('OP_kernels[' +str(nk)+ '].name      = name;')
    code('OP_kernels[' +str(nk)+ '].count    += 1;')
    if insert_thread_timers:
        code('OP_kernels[' +str(nk)+ '].times[0] += non_thread_walltime;')
    else:
        code('OP_kernels[' +str(nk)+ '].time     += wall_t2 - wall_t1;')

    if ninds == 0:
      line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *'

      for g_m in range (0,nargs):
        if optflags[g_m]==1:
          IF('<ARG>.opt')
        if maps[g_m]<>OP_GBL:
          if accs[g_m]==OP_READ:
            code(line+' <ARG>.size;')
          else:
            code(line+' <ARG>.size * 2.0f;')
        if optflags[g_m]==1:
          ENDIF()

    depth -= 2
    code('}')


##########################################################################
#  output individual kernel file
##########################################################################
    if not os.path.exists('openmp'):
        os.makedirs('openmp')
    fid = open('openmp/'+name+'_kernel.cpp','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

# end of main kernel call loop


##########################################################################
#  output one master kernel file
##########################################################################

  file_text =''

  code('#ifdef _OPENMP')
  code('  #include <omp.h>')
  code('#endif')
  code('')

  comm(' global constants       ')

  for nc in range (0,len(consts)):
    if not consts[nc]['user_declared']:
      if consts[nc]['dim']==1:
        code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';')
      else:
        if consts[nc]['dim'] > 0:
          num = str(consts[nc]['dim'])
        else:
          num = 'MAX_CONST_SIZE'

        code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];')
  code('')

  comm(' header                 ')

  if os.path.exists('./user_types.h'):
    code('#include "../user_types.h"')
  code('#include "op_lib_cpp.h"       ')
  code('')

  comm(' user kernel files')

  for nk in range(0,len(kernels)):
    code('#include "'+kernels[nk]['name']+'_kernel.cpp"')
  master = master.split('.')[0]
  fid = open('openmp/'+master.split('.')[0]+'_kernels.cpp','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()
示例#5
0
def op2_gen_seq(master, date, consts, kernels):

    global dims, idxs, typs, indtyps, inddims
    global FORTRAN, CPP, g_m, file_text, depth

    OP_ID = 1
    OP_GBL = 2
    OP_MAP = 3

    OP_READ = 1
    OP_WRITE = 2
    OP_RW = 3
    OP_INC = 4
    OP_MAX = 5
    OP_MIN = 6

    accsstring = ['OP_READ', 'OP_WRITE', 'OP_RW', 'OP_INC', 'OP_MAX', 'OP_MIN']

    any_soa = 0
    for nk in range(0, len(kernels)):
        any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

    for nk in range(0, len(kernels)):

        name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
                ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
                unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])

        optidxs = [0] * nargs
        indopts = [-1] * nargs
        nopts = 0
        for i in range(0, nargs):
            if optflags[i] == 1 and maps[i] == OP_ID:
                optidxs[i] = nopts
                nopts = nopts + 1
            elif optflags[i] == 1 and maps[i] == OP_MAP:
                if i == invinds[
                        inds[i] -
                        1]:  #i.e. I am the first occurence of this dat+map combination
                    optidxs[i] = nopts
                    indopts[inds[i] - 1] = i
                    nopts = nopts + 1
                else:
                    optidxs[i] = optidxs[invinds[inds[i] - 1]]
#
# set two logicals
#
        j = 0
        for i in range(0, nargs):
            if maps[i] == OP_MAP and accs[i] == OP_INC:
                j = i
        ind_inc = j > 0

        j = 0
        for i in range(0, nargs):
            if maps[i] == OP_GBL and accs[i] <> OP_READ:
                j = i
        reduct = j > 0

        ##########################################################################
        #  start with the user kernel function
        ##########################################################################

        FORTRAN = 0
        CPP = 1
        g_m = 0
        file_text = ''
        depth = 0

        comm('user function')

        if FORTRAN:
            code('include ' + name + '.inc')
        elif CPP:
            code('#include "../' + decl_filepath + '"')

##########################################################################
# then C++ stub function
##########################################################################

        code('')
        comm(' host stub function')
        code('void op_par_loop_' + name + '(char const *name, op_set set,')
        depth += 2

        for m in unique_args:
            g_m = m - 1
            if m == unique_args[len(unique_args) - 1]:
                code('op_arg ARG){')
                code('')
            else:
                code('op_arg ARG,')

        code('int nargs = ' + str(nargs) + ';')
        code('op_arg args[' + str(nargs) + '];')
        code('')

        for g_m in range(0, nargs):
            u = [
                i for i in range(0, len(unique_args))
                if unique_args[i] - 1 == g_m
            ]
            if len(u) > 0 and vectorised[g_m] > 0:
                code('ARG.idx = 0;')
                code('args[' + str(g_m) + '] = ARG;')

                v = [
                    int(vectorised[i] == vectorised[g_m])
                    for i in range(0, len(vectorised))
                ]
                first = [i for i in range(0, len(v)) if v[i] == 1]
                first = first[0]
                if (optflags[g_m] == 1):
                    argtyp = 'op_opt_arg_dat(arg' + str(first) + '.opt, '
                else:
                    argtyp = 'op_arg_dat('

                FOR('v', '1', str(sum(v)))
                code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
                str(first)+'.map, DIM, "TYP", '+accsstring[accs[g_m]-1]+');')
                ENDFOR()
                code('')
            elif vectorised[g_m] > 0:
                pass
            else:
                code('args[' + str(g_m) + '] = ARG;')

#
# start timing
#
        code('')
        comm(' initialise timers')
        code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
        code('op_timing_realloc(' + str(nk) + ');')
        code('op_timers_core(&cpu_t1, &wall_t1);')
        code('')

        #
        #   indirect bits
        #
        if ninds > 0:
            IF('OP_diags>2')
            code('printf(" kernel routine with indirection: ' + name +
                 '\\n");')
            ENDIF()

#
# direct bit
#
        else:
            code('')
            IF('OP_diags>2')
            code('printf(" kernel routine w/o indirection:  ' + name + '");')
            ENDIF()

        code('')
        code('int set_size = op_mpi_halo_exchanges(set, nargs, args);')

        code('')
        IF('set->size >0')
        code('')

        #
        # kernel call for indirect version
        #
        if ninds > 0:
            FOR('n', '0', 'set_size')
            IF('n==set->core_size')
            code('op_mpi_wait_all(nargs, args);')
            ENDIF()
            if nmaps > 0:
                k = []
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
                        k = k + [mapinds[g_m]]
                        code('int map' + str(mapinds[g_m]) + 'idx = arg' +
                             str(invmapinds[inds[g_m] - 1]) +
                             '.map_data[n * arg' +
                             str(invmapinds[inds[g_m] - 1]) + '.map->dim + ' +
                             str(idxs[g_m]) + '];')
            code('')
            for g_m in range(0, nargs):
                u = [
                    i for i in range(0, len(unique_args))
                    if unique_args[i] - 1 == g_m
                ]
                if len(u) > 0 and vectorised[g_m] > 0:
                    if accs[g_m] == OP_READ:
                        line = 'const TYP* ARG_vec[] = {\n'
                    else:
                        line = 'TYP* ARG_vec[] = {\n'

                    v = [
                        int(vectorised[i] == vectorised[g_m])
                        for i in range(0, len(vectorised))
                    ]
                    first = [i for i in range(0, len(v)) if v[i] == 1]
                    first = first[0]

                    indent = ' ' * (depth + 2)
                    for k in range(0, sum(v)):
                        line = line + indent + ' &((TYP*)arg' + str(
                            first) + '.data)[DIM * map' + str(
                                mapinds[g_m + k]) + 'idx],\n'
                    line = line[:-2] + '};'
                    code(line)
            code('')

            line = name + '('
            indent = '\n' + ' ' * (depth + 2)
            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID:
                    line = line + indent + '&((' + typs[g_m] + '*)arg' + str(
                        g_m) + '.data)[' + str(dims[g_m]) + ' * n]'
                if maps[g_m] == OP_MAP:
                    if vectorised[g_m]:
                        if g_m + 1 in unique_args:
                            line = line + indent + 'arg' + str(g_m) + '_vec'
                    else:
                        line = line + indent + '&((' + typs[
                            g_m] + '*)arg' + str(
                                invinds[inds[g_m] - 1]) + '.data)[' + str(
                                    dims[g_m]) + ' * map' + str(
                                        mapinds[g_m]) + 'idx]'
                if maps[g_m] == OP_GBL:
                    line = line + indent + '(' + typs[g_m] + '*)arg' + str(
                        g_m) + '.data'
                if g_m < nargs - 1:
                    if g_m + 1 in unique_args and not g_m + 1 == unique_args[
                            -1]:
                        line = line + ','
                else:
                    line = line + ');'
            code(line)
            ENDFOR()

#
# kernel call for direct version
#
        else:
            FOR('n', '0', 'set_size')
            line = name + '('
            indent = '\n' + ' ' * (depth + 2)
            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID:
                    line = line + indent + '&((' + typs[g_m] + '*)arg' + str(
                        g_m) + '.data)[' + str(dims[g_m]) + '*n]'
                if maps[g_m] == OP_GBL:
                    line = line + indent + '(' + typs[g_m] + '*)arg' + str(
                        g_m) + '.data'
                if g_m < nargs - 1:
                    line = line + ','
                else:
                    line = line + ');'
            code(line)
            ENDFOR()

        ENDIF()
        code('')

        #zero set size issues
        if ninds > 0:
            IF('set_size == 0 || set_size == set->core_size')
            code('op_mpi_wait_all(nargs, args);')
            ENDIF()

#
# combine reduction data from multiple OpenMP threads
#
        comm(' combine reduction data')
        for g_m in range(0, nargs):
            if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ:
                #        code('op_mpi_reduce(&ARG,('+typs[g_m]+'*)ARG.data);')
                if typs[g_m] == 'double':  #need for both direct and indirect
                    code('op_mpi_reduce_double(&ARG,(' + typs[g_m] +
                         '*)ARG.data);')
                elif typs[g_m] == 'float':
                    code('op_mpi_reduce_float(&ARG,(' + typs[g_m] +
                         '*)ARG.data);')
                elif typs[g_m] == 'int':
                    code('op_mpi_reduce_int(&ARG,(' + typs[g_m] +
                         '*)ARG.data);')
                else:
                    print 'Type ' + typs[
                        g_m] + ' not supported in OpenACC code generator, please add it'
                    exit(-1)

        code('op_mpi_set_dirtybit(nargs, args);')
        code('')

        #
        # update kernel record
        #

        comm(' update kernel record')
        code('op_timers_core(&cpu_t2, &wall_t2);')
        code('OP_kernels[' + str(nk) + '].name      = name;')
        code('OP_kernels[' + str(nk) + '].count    += 1;')
        code('OP_kernels[' + str(nk) + '].time     += wall_t2 - wall_t1;')

        if ninds == 0:
            line = 'OP_kernels[' + str(nk) + '].transfer += (float)set->size *'

            for g_m in range(0, nargs):
                if optflags[g_m] == 1:
                    IF('ARG.opt')
                if maps[g_m] <> OP_GBL:
                    if accs[g_m] == OP_READ:
                        code(line + ' ARG.size;')
                    else:
                        code(line + ' ARG.size * 2.0f;')
                if optflags[g_m] == 1:
                    ENDIF()
        else:
            names = []
            for g_m in range(0, ninds):
                mult = ''
                if indaccs[g_m] <> OP_WRITE and indaccs[g_m] <> OP_READ:
                    mult = ' * 2.0f'
                if not var[invinds[g_m]] in names:
                    if optflags[g_m] == 1:
                        IF('arg' + str(invinds[g_m]) + '.opt')
                    code('OP_kernels[' + str(nk) +
                         '].transfer += (float)set->size * arg' +
                         str(invinds[g_m]) + '.size' + mult + ';')
                    if optflags[g_m] == 1:
                        ENDIF()
                    names = names + [var[invinds[g_m]]]
            for g_m in range(0, nargs):
                mult = ''
                if accs[g_m] <> OP_WRITE and accs[g_m] <> OP_READ:
                    mult = ' * 2.0f'
                if not var[g_m] in names:
                    names = names + [var[g_m]]
                    if optflags[g_m] == 1:
                        IF('ARG.opt')
                    if maps[g_m] == OP_ID:
                        code('OP_kernels[' + str(nk) +
                             '].transfer += (float)set->size * arg' +
                             str(g_m) + '.size' + mult + ';')
                    elif maps[g_m] == OP_GBL:
                        code('OP_kernels[' + str(nk) +
                             '].transfer += (float)set->size * arg' +
                             str(g_m) + '.size' + mult + ';')
                    if optflags[g_m] == 1:
                        ENDIF()
            if nmaps > 0:
                k = []
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
                        k = k + [mapnames[g_m]]
                        code('OP_kernels[' + str(nk) +
                             '].transfer += (float)set->size * arg' +
                             str(invinds[inds[g_m] - 1]) + '.map->dim * 4.0f;')

        depth -= 2
        code('}')

        ##########################################################################
        #  output individual kernel file
        ##########################################################################
        if not os.path.exists('seq'):
            os.makedirs('seq')
        fid = open('seq/' + name + '_seqkernel.cpp', 'w')
        date = datetime.datetime.now()
        fid.write('//\n// auto-generated by op2.py\n//\n\n')
        fid.write(file_text)
        fid.close()


# end of main kernel call loop

##########################################################################
#  output one master kernel file
##########################################################################

    file_text = ''

    comm(' global constants       ')

    for nc in range(0, len(consts)):
        if consts[nc]['dim'] == 1:
            code('extern ' + consts[nc]['type'][1:-1] + ' ' +
                 consts[nc]['name'] + ';')
        else:
            if consts[nc]['dim'] > 0:
                num = str(consts[nc]['dim'])
            else:
                num = 'MAX_CONST_SIZE'

            code('extern ' + consts[nc]['type'][1:-1] + ' ' +
                 consts[nc]['name'] + '[' + num + '];')
    code('')

    comm(' header                 ')

    if os.path.exists('./user_types.h'):
        code('#include "../user_types.h"')
    code('#include "op_lib_cpp.h"       ')
    code('')

    comm(' user kernel files')

    for nk in range(0, len(kernels)):
        code('#include "' + kernels[nk]['name'] + '_seqkernel.cpp"')
    master = master.split('.')[0]
    fid = open('seq/' + master.split('.')[0] + '_seqkernels.cpp', 'w')
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()
示例#6
0
def op2_gen_openmp4(master, date, consts, kernels):

  global dims, idxs, typs, indtyps, inddims
  global FORTRAN, CPP, g_m, file_text, depth

  OP_ID   = 1;  OP_GBL   = 2;  OP_MAP = 3;

  OP_READ = 1;  OP_WRITE = 2;  OP_RW  = 3;
  OP_INC  = 4;  OP_MAX   = 5;  OP_MIN = 6;

  accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ]
  op2_compiler = os.getenv('OP2_COMPILER','0');
  any_soa = 0
  maptype = 'map'
  for nk in range (0,len(kernels)):
    any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

  for nk in range (0,len(kernels)):

    name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
            ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
            unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])

    optidxs = [0]*nargs
    indopts = [-1]*nargs
    nopts = 0
    for i in range(0,nargs):
      if optflags[i] == 1 and maps[i] == OP_ID:
        optidxs[i] = nopts
        nopts = nopts+1
      elif optflags[i] == 1 and maps[i] == OP_MAP:
        if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination
          optidxs[i] = nopts
          indopts[inds[i]-1] = i
          nopts = nopts+1
        else:
          optidxs[i] = optidxs[invinds[inds[i]-1]]
#
# set two logicals
#
    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_INC:
        j = i
    ind_inc = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_GBL and accs[i] <> OP_READ:
        j = i
    reduct = j >= 0

##########################################################################
#  start with the user kernel function
##########################################################################

    FORTRAN = 0;
    CPP     = 1;
    g_m = 0;
    file_text = ''
    depth = 0

    comm('user function')

    #strides for SoA
    if any_soa:
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;')
            code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;')
      dir_soa = -1
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1):
          code('int direct_'+name+'_stride_OP2CONSTANT;')
          code('int direct_'+name+'_stride_OP2HOST=-1;')
          dir_soa = g_m
          break

    comm('user function')
    file_name = decl_filepath

    f = open(file_name, 'r')
    kernel_text = f.read()
    f.close()

    kernel_text = op2_gen_common.comment_remover(kernel_text)
    kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text)

    p = re.compile('void\\s+\\b'+name+'\\b')
    i = p.search(kernel_text).start()

    if(i < 0):
      print "\n********"
      print "Error: cannot locate user kernel function name: "+name+" - Aborting code generation"
      exit(2)
    i2 = i

    #i = kernel_text[0:i].rfind('\n') #reverse find
    j = kernel_text[i:].find('{')
    k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}')
    signature_text = kernel_text[i:i+j]
    l = signature_text[0:].find('(')
    head_text = signature_text[0:l] #save function name
    m = op2_gen_common.para_parse(signature_text, 0, '(', ')')
    signature_text = signature_text[l+1:m]
    body_text = kernel_text[i+j+1:k]

    # check for number of arguments
    if len(signature_text.split(',')) != nargs_novec:
        print 'Error parsing user kernel(%s): must have %d arguments' \
              % name, nargs
        return

    for i in range(0,nargs_novec):
        var = signature_text.split(',')[i].strip()
        if kernels[nk]['soaflags'][i]:
          var = var.replace('*','')
          #locate var in body and replace by adding [idx]
          length = len(re.compile('\\s+\\b').split(var))
          var2 = re.compile('\\s+\\b').split(var)[length-1].strip()

          if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP:
            body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+ \
                op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)
          else:
            body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text)
            body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \
                op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)

    for nc in range(0,len(consts)): 
      varname = consts[nc]['name']
      body_text = re.sub('\\b'+varname+'\\b', varname+'_ompkernel',body_text)
#      if consts[nc]['dim'] == 1:
#        body_text = re.sub(varname+'(?!\w)', varname+'_ompkernel', body_text)
#      else:
#        body_text = re.sub('\*'+varname+'(?!\[)', varname+'[0]', body_text)
#        body_text = re.sub(r''+varname+'\[([A-Za-z0-9]*)\]'+'', varname+r'_ompkernel[\1]', body_text)

    vec = 0
    for n in range(0,nargs):
      if (vectorised[n] == 1):
        vec = 1
    kernel_params = [ var.strip() for var in signature_text.split(',')]
    if vec:
      new_kernel_params = []
      for m in range(0,nargs_novec):
        if int(kernels[nk]['idxs'][m])<0 and int(kernels[nk]['maps'][m]) == OP_MAP:
          new_kernel_params = new_kernel_params + [kernel_params[m]]*int(-1*int(kernels[nk]['idxs'][m]))
        else:
          new_kernel_params = new_kernel_params + [kernel_params[m]]
      kernel_params = new_kernel_params

    # collect constants used by kernel
    kernel_consts = []
    for nc in range(0,len(consts)):
      if body_text.find(consts[nc]['name']+'_ompkernel') != -1:
        kernel_consts.append(nc)

############################################################
#  omp4 function call definition
############################################################
    code('')
    func_call_signaure_text = 'void ' + name + '_omp4_kernel('
    params = ''
    indent = '\n' + '  '
    k = []
    for g_m in range(0, nargs):
      if maps[g_m] == OP_GBL:
        params += indent + rep('TYP *ARG,',g_m)
      if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k):
        k = k + [invmapinds[inds[g_m]-1]]
        params += indent +  'int *map'+str(mapinds[g_m])+','
        if maptype == 'map':
          params += indent +  'int map'+str(mapinds[g_m])+'size,'
      if maps[g_m] == OP_ID:
        params += indent + rep('TYP *data'+str(g_m)+',', g_m)
        if maptype == 'map':
          params += indent +  'int dat'+str(g_m)+'size,'
    for m in range(1,ninds+1):
      g_m = invinds[m-1]
      params += indent + rep('TYP *data'+str(g_m)+',', g_m)
      if maptype == 'map':
        params += indent +  'int dat'+str(g_m)+'size,'
    if ninds>0:
      # add indirect kernel specific params to kernel func call 
      params += indent + 'int *col_reord,' + indent + 'int set_size1,' + indent + 'int start,' + indent + 'int end,'
    else:
      # add direct kernel specific params to kernel func call 
      params += indent + 'int count,'
    params += indent + 'int num_teams,' + indent + 'int nthread'
    #add strides for SoA to params
    if any_soa:
      indent = ','+indent
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            params += indent + 'int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT'
      
      if dir_soa<>-1:
        params += indent + 'int direct_'+name+'_stride_OP2CONSTANT'
    code(func_call_signaure_text+params+');')

##########################################################################
# then C++ stub function
##########################################################################

    code('')
    comm(' host stub function')
    code('void op_par_loop_'+name+'(char const *name, op_set set,')
    depth += 2

    for m in unique_args:
      g_m = m - 1
      if m == unique_args[len(unique_args)-1]:
        code('op_arg ARG){');
        code('')
      else:
        code('op_arg ARG,')

    for g_m in range (0,nargs):
      if maps[g_m]==OP_GBL: #and accs[g_m] <> OP_READ:
        code('TYP*ARGh = (TYP *)ARG.data;')

    code('int nargs = '+str(nargs)+';')
    code('op_arg args['+str(nargs)+'];')
    code('')

    for g_m in range (0,nargs):
      u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
      if len(u) > 0 and vectorised[g_m] > 0:
        code('ARG.idx = 0;')
        code('args['+str(g_m)+'] = ARG;')

        v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
        first = [i for i in range(0,len(v)) if v[i] == 1]
        first = first[0]
        if (optflags[g_m] == 1):
          argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, '
        else:
          argtyp = 'op_arg_dat('

        FOR('v','1',str(sum(v)))
        code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
        str(first)+'.map, DIM, "TYP", '+accsstring[accs[g_m]-1]+');')
        ENDFOR()
        code('')
      elif vectorised[g_m]>0:
        pass
      else:
        code('args['+str(g_m)+'] = ARG;')

#
# start timing
#
    code('')
    comm(' initialise timers')
    code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
    code('op_timing_realloc('+str(nk)+');')
    code('op_timers_core(&cpu_t1, &wall_t1);')
    code('OP_kernels[' +str(nk)+ '].name      = name;')
    code('OP_kernels[' +str(nk)+ '].count    += 1;')
    code('')

#
#   indirect bits
#
    if ninds>0:
      code('int  ninds   = '+str(ninds)+';')
      line = 'int  inds['+str(nargs)+'] = {'
      for m in range(0,nargs):
        line += str(inds[m]-1)+','
      code(line[:-1]+'};')
      code('')

      IF('OP_diags>2')
      code('printf(" kernel routine with indirection: '+name+'\\n");')
      ENDIF()
      code('')
      comm(' get plan')
      code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);')

#
# direct bit
#
    else:
      code('')
      IF('OP_diags>2')
      code('printf(" kernel routine w/o indirection:  '+ name + '");')
      ENDIF()
      code('')
      code('op_mpi_halo_exchanges_cuda(set, nargs, args);')

#
# get part and block size
#
    code('')
    code('#ifdef OP_PART_SIZE_'+ str(nk))
    code('  int part_size = OP_PART_SIZE_'+str(nk)+';')
    code('#else')
    code('  int part_size = OP_part_size;')
    code('#endif')
    code('#ifdef OP_BLOCK_SIZE_'+ str(nk))
    code('  int nthread = OP_BLOCK_SIZE_'+str(nk)+';')
    code('#else')
    code('  int nthread = OP_block_size;')
    code('#endif')

    code('')
    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL: #and accs[g_m]<>OP_READ:
        if not dims[g_m].isdigit() or int(dims[g_m]) > 1:
          print 'ERROR: OpenMP 4 does not support multi-dimensional variables'
          exit(-1)
        code('TYP ARG_l = ARGh[0];')

    if ninds > 0:
      code('')
      code('int ncolors = 0;')
      code('int set_size1 = set->size + set->exec_size;')
    code('')
    IF('set->size >0')
    #managing constants
    if any_soa:
      code('')
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))')
            code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');')
            code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT = opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST;')
            ENDIF()
      if dir_soa<>-1:
          IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))')
          code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');')
          code('direct_'+name+'_stride_OP2CONSTANT = direct_'+name+'_stride_OP2HOST;')
          ENDIF()
    code('')
    comm('Set up typed device pointers for OpenMP')
    if nmaps > 0:
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k):
          k = k + [invmapinds[inds[g_m]-1]]
          code('int *map'+str(mapinds[g_m])+' = arg'+str(invmapinds[inds[g_m]-1])+'.map_data_d;')
          if maptype == 'map':
            code(' int map'+str(mapinds[g_m])+'size = arg'+str(invmapinds[inds[g_m]-1])+'.map->dim * set_size1;') 

    code('')
    for g_m in range(0,nargs):
      if maps[g_m] == OP_ID:
        code(typs[g_m]+'* data'+str(g_m)+' = ('+typs[g_m]+'*)arg'+str(g_m)+'.data_d;')
        if maptype == 'map':
          if optflags[g_m]:
              code('int dat'+str(g_m)+'size = (arg'+str(g_m)+'.opt?1:0) * getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;')
          else:
            code('int dat'+str(g_m)+'size = getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;')

    for m in range(1,ninds+1):
      g_m = invinds[m-1]
      code('TYP *data'+str(g_m)+' = (TYP *)ARG.data_d;')
      if maptype == 'map':
        if optflags[g_m]:
            code('int dat'+str(g_m)+'size = (arg'+str(g_m)+'.opt?1:0) * getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;')
        else:
          code('int dat'+str(g_m)+'size = getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;')
    
#
# prepare kernel params for indirect version
#
    if ninds>0:
      code('')
      code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);')
      code('ncolors = Plan->ncolors;')
      code('int *col_reord = Plan->col_reord;')
      code('')
      comm(' execute plan')
      FOR('col','0','Plan->ncolors')
      IF('col==1')
      code('op_mpi_wait_all_cuda(nargs, args);')
      ENDIF()
      code('int start = Plan->col_offsets[0][col];')
      code('int end = Plan->col_offsets[0][col+1];')
      code('')
#
# kernel function call
#
    indent = '\n' + ' ' * (depth+2)
    call_params = ','.join([ indent + re.sub(r'\*arg(\d+)',r'&arg\1_l',param.strip().split(' ')[-1]) for param in params.split(',')])
    call_params = call_params.replace('*','')
    # set params for indirect version
    if ninds>0:
      call_params = call_params.replace('num_teams','part_size!=0?(end-start-1)/part_size+1:(end-start-1)/nthread')
    # set params for direct version
    else:
      call_params = re.sub('count','set->size',call_params);
      call_params = call_params.replace('num_teams','part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread') 
    code(func_call_signaure_text.split(' ')[-1]+call_params+');')
    code('')

    if ninds>0:
      if reduct:
        comm(' combine reduction data')
        IF('col == Plan->ncolors_owned-1')
        for g_m in range(0,nargs):
          if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ:
            if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE:
              code('ARGh[0] = ARG_l;')
            elif accs[g_m]==OP_MIN:
              code('ARGh[0]  = MIN(ARGh[0],ARG_l);')
            elif  accs[g_m]==OP_MAX:
              code('ARGh[0]  = MAX(ARGh[0],ARG_l);')
            else:
              error('internal error: invalid reduction option')
        ENDIF()
      ENDFOR()
      code('OP_kernels['+str(nk)+'].transfer  += Plan->transfer;')
      code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;')

    ENDIF()
    code('')

    #zero set size issues
    if ninds>0:
      IF('set_size == 0 || set_size == set->core_size || ncolors == 1')
      code('op_mpi_wait_all_cuda(nargs, args);')
      ENDIF()

#
# combine reduction data from multiple OpenMP threads
#
    comm(' combine reduction data')
    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ:
        if ninds==0: #direct version only
          if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE:
            code('ARGh[0] = ARG_l;')
          elif accs[g_m]==OP_MIN:
            code('ARGh[0]  = MIN(ARGh[0],ARG_l);')
          elif accs[g_m]==OP_MAX:
            code('ARGh[0]  = MAX(ARGh[0],ARG_l);')
          else:
            print 'internal error: invalid reduction option'
        if typs[g_m] == 'double': #need for both direct and indirect
          code('op_mpi_reduce_double(&ARG,ARGh);')
        elif typs[g_m] == 'float':
          code('op_mpi_reduce_float(&ARG,ARGh);')
        elif typs[g_m] == 'int':
          code('op_mpi_reduce_int(&ARG,ARGh);')
        else:
          print 'Type '+typs[g_m]+' not supported in OpenMP4 code generator, please add it'
          exit(-1)


    code('op_mpi_set_dirtybit_cuda(nargs, args);')
    code('')

#
# update kernel record
#

    code('if (OP_diags>1) deviceSync();')
    comm(' update kernel record')
    code('op_timers_core(&cpu_t2, &wall_t2);')
    code('OP_kernels[' +str(nk)+ '].time     += wall_t2 - wall_t1;')

    if ninds == 0:
      line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *'

      for g_m in range (0,nargs):
        if optflags[g_m]==1:
          IF('ARG.opt')
        if maps[g_m]<>OP_GBL:
          if accs[g_m]==OP_READ:
            code(line+' ARG.size;')
          else:
            code(line+' ARG.size * 2.0f;')
        if optflags[g_m]==1:
          ENDIF()

    depth -= 2
    code('}')

##########################################################################
#  output individual kernel file
##########################################################################
    if not os.path.exists('openmp4'):
        os.makedirs('openmp4')
    fid = open('openmp4/'+name+'_omp4kernel.cpp','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

##############################################################
# generate ****_omp4kernel_func.cpp
##############################################################
    file_text = ''

    code(func_call_signaure_text+params+'){')
    code('')
    depth += 2
    for g_m in range(0, nargs):
      if maps[g_m] == OP_GBL:
        code('TYP ARG_l = *ARG;')
    line = '#pragma omp target teams'
    if op2_compiler == 'clang':
      line +=' distribute parallel for schedule(static,1)\\\n' + (depth+2)*' '
    line +=' num_teams(num_teams) thread_limit(nthread) '
    map_clause = ''
    if maptype == 'map':
      map_clause = 'map(to:'
    elif maptype == 'is_device_ptr':
      map_clause = 'is_device_ptr('
       
    for g_m in range(0,nargs):
      if maps[g_m] == OP_ID:
        if maptype == 'map':
          map_clause += 'data'+str(g_m)+'[0:dat'+str(g_m)+'size],'
        else:
          map_clause += 'data'+str(g_m)+','
    if map_clause != 'is_device_ptr(' and map_clause != 'map(to:':
      map_clause = map_clause[:-1]+')'
      line += map_clause
    # mapping global consts
    if len(kernel_consts) != 0:
      line += ' \\\n' + (depth+2)*' ' + 'map(to:'
      for nc in kernel_consts:
        line += ' ' + consts[nc]['name']+'_ompkernel,'
        if consts[nc]['dim'] != 1:
          if consts[nc]['dim'] > 0:
            num = str(consts[nc]['dim'])
          else:
            num = 'MAX_CONST_SIZE'
          line = line[:-1] + '[:'+ num +'],'
      line = line[:-1]+')'
    # prepare reduction
    reduction_string = ''
    reduction_mapping = ''
    if reduct:
      reduction_mapping ='\\\n'+(depth+2)*' '+ 'map(tofrom:'
      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ:
          if accs[g_m] == OP_INC:
            reduction_string += ' reduction(+:arg%d_l)' % g_m
            reduction_mapping += ' arg%d_l,' % g_m
          if accs[g_m] == OP_MIN:
            reduction_string += ' reduction(min:arg%d_l)' % g_m
            reduction_mapping += ' arg%d_l,' % g_m
          if accs[g_m] == OP_MAX:
            reduction_string += ' reduction(max:arg%d_l)' % g_m
            reduction_mapping += ' arg%d_l,' % g_m
          if accs[g_m] == OP_WRITE:
            reduction_mapping += ' arg%d_l,' % g_m
      reduction_mapping = reduction_mapping[0:-1]+')' 
#
# map extra pointers for indirect version
#
    if ninds>0:
      if maptype == 'map':
        line += '\\\n'+(depth+2)*' '+'map(to:col_reord[0:set_size1],'
      else:
        line += '\\\n'+(depth+2)*' '+'map(to:col_reord,'
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k):
            k = k + [invmapinds[inds[g_m]-1]]
            if maptype == 'map':
              line = line + 'map'+str(mapinds[g_m])+'[0:map'+str(mapinds[g_m])+'size],'
            else:
              line = line + 'map'+str(mapinds[g_m])+','
      for m in range(1,ninds+1):
        g_m = invinds[m-1]
        if maptype == 'map':
          line = line + 'data'+str(g_m)+'[0:dat'+str(g_m)+'size],'
        else:
          line = line + 'data'+str(g_m)+','
      line = line[:-1]+')'
#
# write omp pragma
#
    code(line + reduction_mapping + reduction_string)
    if op2_compiler != 'clang':
      line = '#pragma omp distribute parallel for schedule(static,1)'
      code(line + reduction_string)
#
# start for loop indirect version
#
    if ninds>0:
      FOR('e','start','end')
      code('int n_op = col_reord[e];')
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\
              '[n_op + set_size1 * '+str(idxs[g_m])+'];')

      code('')
      for g_m in range (0,nargs):
        u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
        if len(u) > 0 and vectorised[g_m] > 0:
          if accs[g_m] == OP_READ:
            line = 'const TYP* ARG_vec[] = {\n'
          else:
            line = 'TYP* ARG_vec[] = {\n'

          v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
          first = [i for i in range(0,len(v)) if v[i] == 1]
          first = first[0]

          indent = ' '*(depth+2)
          for k in range(0,sum(v)):
            if soaflags[g_m]:
              line = line + indent + ' &data'+str(first)+'[map'+str(mapinds[g_m+k])+'idx],\n'
            else:
              line = line + indent + ' &data'+str(first)+'[DIM * map'+str(mapinds[g_m+k])+'idx],\n'
          line = line[:-2]+'};'
          code(line)
#
# direct version
#
    else:
      FOR('n_op','0','count')
#
# write inlined kernel function
#
    comm('variable mapping')
    for g_m in range(0,nargs):
      line = kernel_params[g_m] + ' = '
      if maps[g_m] == OP_ID:
        if soaflags[g_m]:
          line += '&data%d[n_op]' % g_m
        else:
          line += '&data'+str(g_m)+'['+str(dims[g_m])+'*n_op]'
      if maps[g_m] == OP_MAP:
        if vectorised[g_m]:
          if g_m+1 in unique_args:
            line += 'arg'+str(g_m)+'_vec'
          else:
            line = ''
        else:
          if soaflags[g_m]:
            line += '&data'+str(invinds[inds[g_m]-1])+'[map'+str(mapinds[g_m])+'idx]'
          else:
            line += '&data'+str(invinds[inds[g_m]-1])+'['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]'
      if maps[g_m] == OP_GBL:
        line += '&arg%d_l' % g_m
      if len(line):
        line += ';'
        code(line)
    
    code('')
    comm('inline function')
    indent = ' ' * (depth-2)
    inline_body_text = ''
    for line in body_text.split('\n'):
      if len(line):
        inline_body_text += indent+line+'\n'
      else:
        inline_body_text += '\n'
    code(inline_body_text)
    comm('end inline func')

    ENDFOR()
    code('')
    # end kernel function
    for g_m in range(0, nargs):
      if maps[g_m] == OP_GBL:
        code('*ARG = ARG_l;')
    depth -= 2;
    code('}')


##########################################################################
#  output individual omp4kernel file
##########################################################################
    fid = open('openmp4/'+name+'_omp4kernel_func.cpp','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

# end of main kernel call loop


##########################################################################
#  output one master kernel file
##########################################################################

  file_text =''
  comm(' header                 ')
  code('#include "op_lib_cpp.h"       ')
  code('')
  comm(' user kernel files')

  for nk in range(0,len(kernels)):
    code('#include "'+kernels[nk]['name']+'_omp4kernel.cpp"')
  master = master.split('.')[0]
  fid = open('openmp4/'+master.split('.')[0]+'_omp4kernels.cpp','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()


##########################################################################
#  output omp4 master kernel file
##########################################################################

  file_text =''

  comm(' global constants       ')

  for nc in range (0,len(consts)):
    if consts[nc]['dim']==1:
      code(consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_ompkernel;')
    else:
      if consts[nc]['dim'] > 0:
        num = str(consts[nc]['dim'])
      else:
        num = 'MAX_CONST_SIZE'
      code(consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_ompkernel['+num+'];')
  code('')

  comm(' header                 ')

  if os.path.exists('./user_types.h'):
    code('#include "../user_types.h"')
  code('#include "op_lib_cpp.h"       ')
  code('')

  code('void op_decl_const_char(int dim, char const *type,')
  code('  int size, char *dat, char const *name){')
  indent = ' ' * ( 2+ depth)
  line = '  ' 
  for nc in range (0,len(consts)):
    varname = consts[nc]['name']
    if nc > 0:
        line += ' else '
    line += 'if(!strcmp(name, "%s")) {\n' %varname + indent + 2*' ' + 'memcpy('
    if consts[nc]['dim']==1:
      line += '&'
    line += varname+ '_ompkernel, dat, dim*size);\n' + indent + '#pragma omp target enter data map(to:'+varname+'_ompkernel'
    if consts[nc]['dim'] !=1:
      line += '[:%s]' % str(consts[nc]['dim']) if consts[nc]['dim'] > 0 else 'MAX_CONST_SIZE'
    line += ')\n'+indent + '}'
  code(line)
  code('}')

  comm(' user kernel files')

  for nk in range(0,len(kernels)):
    code('#include "'+kernels[nk]['name']+'_omp4kernel_func.cpp"')
  master = master.split('.')[0]
  fid = open('openmp4/'+master.split('.')[0]+'_omp4kernel_funcs.cpp','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()
示例#7
0
def op2_gen_cuda(master, date, consts, kernels, sets):

  global dims, idxs, typs, indtyps, inddims
  global FORTRAN, CPP, g_m, file_text, depth

  OP_ID   = 1;  OP_GBL   = 2;  OP_MAP = 3;

  OP_READ = 1;  OP_WRITE = 2;  OP_RW  = 3;
  OP_INC  = 4;  OP_MAX   = 5;  OP_MIN = 6;

  accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ]

  any_soa = 0
  for nk in range (0,len(kernels)):
    any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

  for nk in range (0,len(kernels)):

    name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
           ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
           unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])

    optidxs = [0]*nargs
    indopts = [-1]*nargs
    nopts = 0
    for i in range(0,nargs):
      if optflags[i] == 1 and maps[i] == OP_ID:
        optidxs[i] = nopts
        nopts = nopts+1
      elif optflags[i] == 1 and maps[i] == OP_MAP:
        if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination
          optidxs[i] = nopts
          indopts[inds[i]-1] = i
          nopts = nopts+1
        else:
          optidxs[i] = optidxs[invinds[inds[i]-1]]
#
# set two logicals
#
    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_INC:
        j = i
    ind_inc = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_RW:
        j = i
    ind_rw= j >= 0
    if (ind_rw):
      print 'Error: indirect OP_RW not supported by op2_gen_cuda code generator'
      sys.exit(2);


    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_GBL and accs[i] <> OP_READ and accs[i] <> OP_WRITE:
        j = i
    reduct = j >= 0

##########################################################################
#  start with CUDA kernel function
##########################################################################

    FORTRAN = 0;
    CPP     = 1;
    g_m = 0;
    file_text = ''
    depth = 0

    comm('user function')

    code('__device__')
    if FORTRAN:
      code('include '+name+'.inc')
    elif CPP:
      code('#include "../'+name+'.h"')

    comm('')
    comm(' CUDA kernel function')

    if FORTRAN:
      code('subroutine op_cuda_'+name+'(')
    elif CPP:
      code('__global__ void op_cuda_'+name+'(')

    depth = 2

    for g_m in range(0,ninds):
      if FORTRAN:
        code('INDTYP *ind_ARG,')
      elif CPP:
        code('INDTYP *ind_ARG,')

    if ninds>0:
      if FORTRAN:
        code('int   *ind_map,')
        code('short *arg_map,')
      elif CPP:
        code('int   *ind_map,')
        code('short *arg_map,')

    for g_m in range (0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m] == OP_READ:
        # declared const for performance
        if FORTRAN:
          code('const TYP *ARG,')
        elif CPP:
          code('const TYP *ARG,')
      elif maps[g_m]==OP_ID and ninds>0:
        if FORTRAN:
          code('ARG,')
        elif CPP:
          code('TYP  *ARG,')
      elif maps[g_m]==OP_GBL or maps[g_m]==OP_ID:
        if FORTRAN:
          code('ARG,')
        elif CPP:
          code('TYP *ARG,')

    if ninds>0:
      if FORTRAN:
        code('int   *ind_arg_sizes,')
        code('int   *ind_arg_offs, ')
        code('int    block_offset, ')
        code('int   *blkmap,       ')
        code('int   *offset,       ')
        code('int   *nelems,       ')
        code('int   *ncolors,      ')
        code('int   *colors,       ')
        code('int   nblocks,       ')
        code('int   set_size) {    ')
      if CPP:
        code('int   *ind_arg_sizes,')
        code('int   *ind_arg_offs, ')
        code('int    block_offset, ')
        code('int   *blkmap,       ')
        code('int   *offset,       ')
        code('int   *nelems,       ')
        code('int   *ncolors,      ')
        code('int   *colors,       ')
        code('int   nblocks,       ')
        code('int   set_size) {    ')
    else:
      code('int   offset_s,    ')
      code('int   set_size ) {')
      code('')

    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE:
        code('TYP ARG_l[DIM];')
        if accs[g_m] == OP_INC:
          FOR('d','0','DIM')
          code('ARG_l[d]=ZERO_TYP;')
          ENDFOR()
        else:
          FOR('d','0','DIM')
          code('ARG_l[d]=ARG[d+blockIdx.x*DIM];')
          ENDFOR()
      elif maps[g_m]==OP_MAP and accs[g_m]==OP_INC:
        code('TYP ARG_l[DIM];')
      elif (ninds==0 and maps[g_m]==OP_ID and dims[g_m]<>'1') and not(soaflags[g_m]):
        code('TYP ARG_l[DIM];')

    for m in range (1,ninds+1):
      g_m = m -1
      v = [int(inds[i]==m) for i in range(len(inds))]
      v_i = [vectorised[i] for i in range(len(inds)) if inds[i] == m]
      if sum(v)>1 and sum(v_i)>0: #check this sum(v_i)
        if indaccs[m-1] == OP_INC:
          ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1
          code('INDTYP *ARG_vec['+str(ind)+'] = {'); depth += 2;
          for n in range(0,nargs):
            if inds[n] == m:
              g_m = n
              code('ARG_l,')
          depth -= 2
          code('};')
        else:
          ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1
          if indaccs[m-1] == OP_READ:
            code('const INDTYP *ARG_vec['+str(ind)+'];')
          else:
            code('INDTYP *ARG_vec['+str(ind)+'];')
#
# lengthy code for general case with indirection
#
    if ninds>0:
      code('')
      for g_m in range (0,ninds):
        code('__shared__  int  *ind_ARG_map, ind_ARG_size;')
      for g_m in range (0,ninds):
        code('__shared__  INDTYP *ind_ARG_s;')

      if ind_inc:
        code('__shared__ int    nelems2, ncolor;')

      code('__shared__ int    nelem, offset_b;')
      code('')
      code('extern __shared__ char shared[];')
      code('')
      IF('blockIdx.x+blockIdx.y*gridDim.x >= nblocks')
      code('return;')
      ENDIF()
      IF('threadIdx.x==0')
      code('')
      comm('get sizes and shift pointers and direct-mapped data')
      code('')
      code('int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x  + block_offset];')
      code('')
      code('nelem    = nelems[blockId];')
      code('offset_b = offset[blockId];')
      code('')

      if ind_inc:
        code('nelems2  = blockDim.x*(1+(nelem-1)/blockDim.x);')
        code('ncolor   = ncolors[blockId];')
        code('')

      for g_m in range (0,ninds):
        code('ind_ARG_size = ind_arg_sizes['+str(g_m)+'+blockId*'+ str(ninds)+'];')

      code('')
      
      for m in range (1,ninds+1):
        g_m = m - 1
        c = [i for i in range(len(inds)) if inds[i]==m]
        code('ind_ARG_map = &ind_map['+str(cumulative_indirect_index[c[0]])+\
        '*set_size] + ind_arg_offs['+str(m-1)+'+blockId*'+str(ninds)+'];')

      code('')
      comm('set shared memory pointers')
      code('int nbytes = 0;')

      for g_m in range(0,ninds):
        code('ind_ARG_s = (INDTYP *) &shared[nbytes];')
        if g_m < ninds-1:
          code('nbytes    += ROUND_UP(ind_ARG_size*sizeof(INDTYP)*INDDIM);')

      ENDIF()
      code('__syncthreads(); // make sure all of above completed')
      code('')
      comm('copy indirect datasets into shared memory or zero increment')
      code('')

      for m in range(0,ninds):
        g_m = m
        if indaccs[m]==OP_READ or indaccs[m]==OP_RW or indaccs[m]==OP_INC:
          FOR_INC('n','threadIdx.x','ind_ARG_size*INDDIM','blockDim.x')
          if indaccs[m]==OP_READ or indaccs[m]==OP_RW:
            code('ind_arg'+str(m)+'_s[n] = ind_arg'+str(m)+'[n%'+inddims[m]+
            '+ind_arg'+str(m)+'_map[n/'+inddims[m]+']*'+inddims[m]+'];')
            code('')
          elif indaccs[m]==OP_INC:
            code('ind_ARG_s[n] = ZERO_INDTYP;')
          ENDFOR()

      code('')
      code('__syncthreads();')
      comm('process set elements')
      code('')

      if ind_inc:
        FOR_INC('n','threadIdx.x','nelems2','blockDim.x')
        code('int col2 = -1;')
        IF('n<nelem')
        comm('initialise local variables')

        for g_m in range(0,nargs):
          if maps[g_m]==OP_MAP and accs[g_m]==OP_INC:
            FOR('d','0','DIM')
            code('ARG_l[d] = ZERO_TYP;')
            ENDFOR()
      else:
        FOR_INC('n','threadIdx.x','nelem','blockDim.x')

#
# simple alternative when no indirection
#
    else:
      use_shared = 0;
      for m in range(0,nargs):
        if maps[m]<>OP_GBL and dims[m]<>'1':
          use_shared = 1

      if use_shared:
        code('int   tid = threadIdx.x%OP_WARPSIZE;')
        code('')
        code('extern __shared__ char shared[];')
        code('char *arg_s = shared + offset_s*(threadIdx.x/OP_WARPSIZE);')

      code('')
      comm('process set elements')
      FOR_INC('n','threadIdx.x+blockIdx.x*blockDim.x','set_size','blockDim.x*gridDim.x')

      if use_shared:
        code('int offset = n - tid;')
        code('int nelems = MIN(OP_WARPSIZE,set_size-offset);')
        comm('copy data into shared memory, then into local')

      for m in range(0,nargs):
        g_m = m
        if (maps[m]<>OP_GBL and accs[m]<>OP_WRITE and dims[m]<>'1') and not(soaflags[m]):
          FOR('m','0','DIM')
          code('((TYP *)arg_s)[tid+m*nelems] = ARG[tid+m*nelems+offset*DIM];')
          ENDFOR()
          code('')
          FOR('m','0','DIM')
          code('ARG_l[m] = ((TYP *)arg_s)[m+tid*DIM];')
          ENDFOR()
          code('')



#
# kernel call
#

    # xxx: array of pointers for non-locals
    for m in range(1,ninds+1):
      s = [i for i in range(len(inds)) if inds[i]==m]
      if sum(s)>1:
        if indaccs[m-1] <> OP_INC:
          code('')
          ctr = 0
          for n in range(0,nargs):
            if inds[n] == m and vectorised[n]:
              code('arg'+str(m-1)+'_vec['+str(ctr)+'] = ind_arg'+\
              str(inds[n]-1)+'_s+arg_map['+str(cumulative_indirect_index[n])+\
              '*set_size+n+offset_b]*'+str(dims[n])+';')
              ctr = ctr+1

    code('')
    comm('user-supplied kernel call')

    line = name+'('
    prefix = ' '*len(name)
    a = 0 #only apply indentation if its not the 0th argument
    indent =''
    for m in range (0, nargs):
      if a > 0:
        indent = '     '+' '*len(name)

      if maps[m] == OP_GBL:
        if accs[m] == OP_READ or accs[m] == OP_WRITE:
          line += rep(indent+'ARG,\n',m)
        else:
          line += rep(indent+'ARG_l,\n',m);
        a =a+1
      elif maps[m]==OP_MAP and  accs[m]==OP_INC and vectorised[m]==0:
        line += rep(indent+'ARG_l,\n',m)
        a =a+1
      elif maps[m]==OP_MAP and vectorised[m]==0:
        line += rep(indent+'ind_arg'+str(inds[m]-1)+'_s+arg_map['+\
        str(cumulative_indirect_index[m])+'*set_size+n+offset_b]*DIM,'+'\n',m)
        a =a+1
      elif maps[m]==OP_MAP and m == 0:
        line += rep(indent+'ARG_vec,'+'\n',inds[m]-1)
        a =a+1
      elif maps[m]==OP_MAP and m>0 and vectorised[m] <> vectorised[m-1]: #xxx:vector
        line += rep(indent+'ARG_vec,'+'\n',inds[m]-1)
        a =a+1
      elif maps[m]==OP_MAP and m>0 and vectorised[m] == vectorised[m-1]:
        line = line
        a =a+1
      elif maps[m]==OP_ID:
        if ninds>0:
          if soaflags[m]:
            line += rep(indent+'ARG+(n+offset_b),\n',m)
          else:
            line += rep(indent+'ARG+(n+offset_b)*DIM,\n',m)
          a =a+1
        else:
          if dims[m] == '1' or soaflags[m]:
            line += rep(indent+'ARG+n,\n',m)
          else:
            line += rep(indent+'ARG_l,\n',m)
          a =a+1
      else:
        print 'internal error 1 '

    code(line[0:-2]+');') #remove final ',' and \n

#
# updating for indirect kernels ...
#
    if ninds>0:
      if ind_inc:
        code('col2 = colors[n+offset_b];')
        ENDIF()
        code('')
        comm('store local variables')
        code('')

        for g_m in range(0,nargs):
          if maps[g_m]==OP_MAP and accs[g_m]==OP_INC:
            code('int ARG_map;')

        IF('col2>=0')

        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
            code('ARG_map = arg_map['+str(cumulative_indirect_index[g_m])+'*set_size+n+offset_b];')

        ENDIF()
        code('')
        FOR('col','0','ncolor')
        IF('col2==col')

        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
            FOR('d','0','DIM')
            code('ind_arg'+str(inds[g_m]-1)+'_s[d+ARG_map*DIM] += ARG_l[d];')
            ENDFOR()

        ENDFOR()
        code('__syncthreads();')
        ENDFOR()
      ENDFOR()

      s = [i for i in range(1,ninds+1) if indaccs[i-1]<> OP_READ]

      if len(s)>0 and max(s)>0:
        code('')
        comm('apply pointered write/increment')

      for g_m in range(0,ninds):
        if indaccs[g_m]==OP_WRITE or indaccs[g_m]==OP_RW or indaccs[g_m]==OP_INC:
          FOR_INC('n','threadIdx.x','INDARG_size*INDDIM','blockDim.x')
          if indaccs[g_m]==OP_WRITE or indaccs[g_m]==OP_RW:
            code('INDARG[n%INDDIM+INDARG_map[n/INDDIM]*INDDIM] = INDARG_s[n];')
          elif indaccs[g_m]==OP_INC:
            code('INDARG[n%INDDIM+INDARG_map[n/INDDIM]*INDDIM] += INDARG_s[n];')
          ENDFOR()
#
# ... and direct kernels
#
    else:
      if use_shared:
        comm('copy back into shared memory, then to device')
      for m in range(0,nargs):
        g_m = m
        if (maps[m]<>OP_GBL and accs[m]<>OP_READ and dims[m]<>'1') and not(soaflags[m]):
          code('')
          FOR('m','0','DIM')
          code('((TYP *)arg_s)[m+tid*DIM] = ARG_l[m];')
          ENDFOR()
          FOR('m','0','DIM')
          code('ARG[tid+m*nelems+offset*DIM] = ((TYP *)arg_s)[tid+m*nelems];')
          ENDFOR()

      depth -= 2
      code('}')

#
# global reduction
#
    if reduct:
       code('')
       comm('global reductions')
       code('')
       for m in range (0,nargs):
         g_m = m
         if maps[m]==OP_GBL and accs[m]<>OP_READ and accs[m] <> OP_WRITE:
           FOR('d','0','DIM')
           if accs[m]==OP_INC:
             code('op_reduction<OP_INC>(&ARG[d+blockIdx.x*DIM],ARG_l[d]);')
           elif accs[m]==OP_MIN:
             code('op_reduction<OP_MIN>(&ARG[d+blockIdx.x*DIM],ARG_l[d]);')
           elif accs[m]==OP_MAX:
             code('op_reduction<OP_MAX>(&ARG[d+blockIdx.x*DIM],ARG_l[d]);')
           else:
             print 'internal error: invalid reduction option'
             sys.exit(2);
           ENDFOR()
    depth -= 2
    code('}')
    code('')

##########################################################################
# then C++ stub function
##########################################################################

    code('')
    comm('host stub function')
    code('void op_par_loop_'+name+'(char const *name, op_set set,')
    depth += 2

    for m in unique_args:
      g_m = m - 1
      if m == unique_args[len(unique_args)-1]:
        code('op_arg ARG){')
        code('')
      else:
        code('op_arg ARG,')

    for g_m in range (0,nargs):
      if maps[g_m]==OP_GBL:
        code('TYP*ARGh = (TYP *)ARG.data;')

    code('int nargs = '+str(nargs)+';')
    code('op_arg args['+str(nargs)+'];')
    code('')

    #print vectorised

    for g_m in range (0,nargs):
      u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
      if len(u) > 0 and vectorised[g_m] > 0:
        code('ARG.idx = 0;')
        code('args['+str(g_m)+'] = ARG;')

        v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
        first = [i for i in range(0,len(v)) if v[i] == 1]
        first = first[0]
        if (optflags[g_m] == 1):
          argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, '
        else:
          argtyp = 'op_arg_dat('

        FOR('v','1',str(sum(v)))
        code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
        str(first)+'.map, DIM, "TYP", '+accsstring[accs[g_m]-1]+');')
        ENDFOR()
        code('')

      elif vectorised[g_m]>0:
        pass
      else:
        code('args['+str(g_m)+'] = ARG;')

#
#   indirect bits
#
    if ninds>0:
      code('')
      code('int    ninds   = '+str(ninds)+';')
      line = '  int    inds['+str(nargs)+'] = {'
      for m in range(0,nargs):
        line += str(inds[m]-1)+','
      code(line[:-1]+'};')
      code('')

      IF('OP_diags>2')
      code('printf(" kernel routine with indirection: '+name+'\\n");')
      ENDIF()

      code('')
      comm('get plan')
      code('#ifdef OP_PART_SIZE_'+ str(nk))
      code('  int part_size = OP_PART_SIZE_'+str(nk)+';')
      code('#else')
      code('  int part_size = OP_part_size;')
      code('#endif')
      code('')
      code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);')

#
# direct bit
#
    else:
      code('')
      IF('OP_diags>2')
      code('printf(" kernel routine w/o indirection:  '+ name + '");')
      ENDIF()
      code('')
      code('op_mpi_halo_exchanges_cuda(set, nargs, args);')
#
# start timing
#
    code('')
    comm(' initialise timers')
    code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
    code('op_timers_core(&cpu_t1, &wall_t1);')
    code('')

    IF('set->size > 0')
    code('')
    code('op_timing_realloc('+str(nk)+');')
    code('OP_kernels[' +str(nk)+ '].name      = name;')
    code('OP_kernels[' +str(nk)+ '].count    += 1;')
    code('')
    if any_soa:
      code('int op2_stride_internal = set->size + set->exec_size + set->nonexec_size;')
      #code('op_decl_const_char(1, "int", sizeof(int), (char *)&op2_stride, "op2_stride");')
      code('cutilSafeCall(cudaMemcpyToSymbol(op2_stride , &op2_stride_internal, sizeof(int)));');
      code('')

#
# kernel call for indirect version
#
    if ninds>0:
      code('op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);')
      code('')


#
# transfer constants
#
    g = [i for i in range(0,nargs) if maps[i] == OP_GBL and (accs[i] == OP_READ or accs[i] == OP_WRITE)]
    if len(g)>0:
      comm('transfer constants to GPU')
      code('int consts_bytes = 0;')
      for m in range(0,nargs):
        g_m = m
        if maps[m]==OP_GBL and (accs[m]==OP_READ or accs[m]==OP_WRITE):
          code('consts_bytes += ROUND_UP(DIM*sizeof(TYP));')

      code('reallocConstArrays(consts_bytes);')
      code('consts_bytes = 0;')

      for m in range(0,nargs):
        if maps[m]==OP_GBL and (accs[m]==OP_READ  or accs[m]==OP_WRITE):
          g_m = m
          code('ARG.data   = OP_consts_h + consts_bytes;')
          code('ARG.data_d = OP_consts_d + consts_bytes;')
          FOR('d','0','DIM')
          code('((TYP *)ARG.data)[d] = ARGh[d];')
          ENDFOR()
          code('consts_bytes += ROUND_UP(DIM*sizeof(TYP));')
      code('mvConstArraysToDevice(consts_bytes);')
      code('')


#
# transfer global reduction initial data
#

    if ninds == 0:
      comm('set CUDA execution parameters')
      code('#ifdef OP_BLOCK_SIZE_'+str(nk))
      code('  int nthread = OP_BLOCK_SIZE_'+str(nk)+';')
      code('#else')
      comm('  int nthread = OP_block_size;')
      code('  int nthread = 128;')
      code('#endif')
      code('')
      code('int nblocks = 200;')
      code('')

    if reduct:
      comm('transfer global reduction data to GPU')
      if ninds>0:
        code('int maxblocks = 0;')
        FOR('col','0','Plan->ncolors')
        code('maxblocks = MAX(maxblocks,Plan->ncolblk[col]);')
        ENDFOR()
      else:
        code('int maxblocks = nblocks;')

      code('int reduct_bytes = 0;')
      code('int reduct_size  = 0;')

      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE:
          code('reduct_bytes += ROUND_UP(maxblocks*DIM*sizeof(TYP));')
          code('reduct_size   = MAX(reduct_size,sizeof(TYP));')

      code('reallocReductArrays(reduct_bytes);')
      code('reduct_bytes = 0;')

      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE:
          code('ARG.data   = OP_reduct_h + reduct_bytes;')
          code('ARG.data_d = OP_reduct_d + reduct_bytes;')
          FOR('b','0','maxblocks')
          FOR('d','0','DIM')
          if accs[g_m]==OP_INC:
            code('((TYP *)ARG.data)[d+b*DIM] = ZERO_TYP;')
          else:
            code('((TYP *)ARG.data)[d+b*DIM] = ARGh[d];')
          ENDFOR()
          ENDFOR()
          code('reduct_bytes += ROUND_UP(maxblocks*DIM*sizeof(TYP));')
      code('mvReductArraysToDevice(reduct_bytes);')
      code('')

#
# kernel call for indirect version
#
    if ninds>0:
      comm('execute plan')
      code('')
      code('int block_offset = 0;')
      FOR('col','0','Plan->ncolors')
      IF('col==Plan->ncolors_core')
      code('op_mpi_wait_all_cuda(nargs, args);')
      ENDIF()
      code('#ifdef OP_BLOCK_SIZE_'+str(nk))
      code('int nthread = OP_BLOCK_SIZE_'+str(nk)+';')
      code('#else')
      code('int nthread = OP_block_size;')
      code('#endif')
      code('')
      code('dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col],')
      code('Plan->ncolblk[col] >= (1<<16) ? (Plan->ncolblk[col]-1)/65535+1: 1, 1);')
      IF('Plan->ncolblk[col] > 0')

      if reduct:
        code('int nshared = MAX(Plan->nshared,reduct_size*nthread);')
      else:
        code('int nshared = Plan->nsharedCol[col];')

      code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(')

      for m in range(1,ninds+1):
        g_m = invinds[m-1]
        code('(TYP *)ARG.data_d,')

      code('Plan->ind_map,')
      code('Plan->loc_map,')

      for g_m in range(0,nargs):
        if inds[g_m]==0:
          code('(TYP*)ARG.data_d,')


      code('Plan->ind_sizes,')
      code('Plan->ind_offs,')
      code('block_offset,')
      code('Plan->blkmap,')
      code('Plan->offset,')
      code('Plan->nelems,')
      code('Plan->nthrcol,')
      code('Plan->thrcol,')
      code('Plan->ncolblk[col],')
      code('set_size);')
      code('')
      if reduct:
        comm('transfer global reduction data back to CPU')
        IF('col == Plan->ncolors_owned-1')
        code('mvReductArraysToHost(reduct_bytes);')
        ENDIF()

      ENDFOR()
      code('block_offset += Plan->ncolblk[col];')
      ENDIF()
#
# kernel call for direct version
#
    else:
      comm('work out shared memory requirements per element')
      code('')
      code('int nshared = 0;')

      for g_m in range(0,nargs):
         if maps[g_m]<>OP_GBL and dims[g_m]<>'1':
           code('nshared = MAX(nshared,sizeof(TYP)*DIM);')

      code('')
      comm('execute plan')
      code('int offset_s = nshared*OP_WARPSIZE;')
      code('')

      if reduct:
        code('nshared = MAX(nshared*nthread,reduct_size*nthread);')
      else:
        code('nshared = nshared*nthread;')

      code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(')

      indent = '  '#*(len(name)+42)
      for g_m in range(0,nargs):
        if g_m > 0:
          code(indent+'(TYP *) ARG.data_d,')
        else:
          code(indent+'(TYP *) ARG.data_d,')

      code(indent+'offset_s,')
      code(indent+'set->size );')

    if ninds>0:
      code('OP_kernels['+str(nk)+'].transfer  += Plan->transfer;')
      code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;')


#
# transfer global reduction initial data
#
    if reduct:
      if ninds == 0:
        comm('transfer global reduction data back to CPU')
        code('mvReductArraysToHost(reduct_bytes);')

      for m in range(0,nargs):
        g_m = m
        if maps[m]==OP_GBL and accs[m]<>OP_READ and accs[m]<>OP_WRITE:
          FOR('b','0','maxblocks')
          FOR('d','0','DIM')
          if accs[m]==OP_INC:
            code('ARGh[d] = ARGh[d] + ((TYP *)ARG.data)[d+b*DIM];')
          elif accs[m]==OP_MIN:
            code('ARGh[d] = MIN(ARGh[d],((TYP *)ARG.data)[d+b*DIM]);')
          elif accs[m]==OP_MAX:
            code('ARGh[d] = MAX(ARGh[d],((TYP *)ARG.data)[d+b*DIM]);')
          ENDFOR()
          ENDFOR()

          code('ARG.data = (char *)ARGh;')
          code('op_mpi_reduce(&ARG,ARGh);')

    for g_m in range(0,nargs):
      if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE:
        code('mvConstArraysToHost(consts_bytes);')
        break

    for g_m in range(0,nargs):
      if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE:
        FOR('d','0','DIM')
        code('ARGh[d] = ((TYP *)ARG.data)[d];') 
        ENDFOR()
        code('ARG.data = (char *)ARGh;')
        code('op_mpi_reduce(&ARG,ARGh);')

    ENDIF()
    code('op_mpi_set_dirtybit_cuda(nargs, args);')

#
# update kernel record
#

    comm('update kernel record')
    code('op_timers_core(&cpu_t2, &wall_t2);')
    code('OP_kernels[' +str(nk)+ '].time     += wall_t2 - wall_t1;')

    if ninds == 0:
      line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *'

      for g_m in range (0,nargs):
        if maps[g_m]<>OP_GBL:
          if accs[g_m]==OP_READ or accs[g_m]==OP_WRITE:
            code(line+' ARG.size;')
          else:
            code(line+' ARG.size * 2.0f;')

    depth = depth - 2
    code('}')


##########################################################################
#  output individual kernel file
##########################################################################
    if not os.path.exists('cuda'):
        os.makedirs('cuda')
    fid = open('cuda/'+name+'_kernel.cu','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n')
    fid.write(file_text)
    fid.close()

# end of main kernel call loop


##########################################################################
#  output one master kernel file
##########################################################################

  file_text = ''

  comm('global constants')

  code('#ifndef MAX_CONST_SIZE')
  code('#define MAX_CONST_SIZE 128')
  code('#endif')
  code('')

  for nc in range (0,len(consts)):
    if consts[nc]['dim']==1:
      code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';')
    else:
      if consts[nc]['dim'] > 0:
        num = str(consts[nc]['dim'])
      else:
        num = 'MAX_CONST_SIZE'

      code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];')
  code('')

  comm('header')
  if os.path.exists('./user_types.h'):
    code('#ifndef OP_FUN_PREFIX\n#define OP_FUN_PREFIX __host__ __device__\n#endif')
    code('#include "../user_types.h"')
  code('#include "op_lib_cpp.h"')
  code('#include "op_cuda_rt_support.h"')
  code('#include "op_cuda_reduction.h"')
  code('')

  # if any_soa:
  #   code('__constant__ int op2_stride;')
  #   code('')
  #   code('#define OP2_STRIDE(arr, idx) arr[op2_stride*(idx)]')

  code('')
  code('void op_decl_const_char(int dim, char const *type,')
  code('int size, char *dat, char const *name){')
  depth = depth + 2

  for nc in range(0,len(consts)):
    IF('!strcmp(name,"'+consts[nc]['name']+'")')
    if consts[nc]['dim'] < 0:
      IF('!strcmp(name,"'+consts[nc]['name']+'") && size>MAX_CONST_SIZE) {')
      code('printf("error: MAX_CONST_SIZE not big enough\n"); exit(1);')
      ENDIF()
    code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+', dat, dim*size));')
    ENDIF()
    code('else ')

  code('{')
  depth = depth + 2
  code('printf("error: unknown const name\\n"); exit(1);')
  ENDIF()


  depth = depth - 2
  code('}')
  code('')
  comm('user kernel files')

  for nk in range(0,len(kernels)):
    file_text = file_text +\
    '#include "'+kernels[nk]['name']+'_kernel.cu"\n'

  master = master.split('.')[0]
  fid = open('cuda/'+master.split('.')[0]+'_kernels.cu','w')
  fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n')
  fid.write(file_text)
  fid.close()
示例#8
0
def op2_gen_openacc(master, date, consts, kernels):

    global dims, idxs, typs, indtyps, inddims
    global FORTRAN, CPP, g_m, file_text, depth

    OP_ID = 1
    OP_GBL = 2
    OP_MAP = 3

    OP_READ = 1
    OP_WRITE = 2
    OP_RW = 3
    OP_INC = 4
    OP_MAX = 5
    OP_MIN = 6

    accsstring = ['OP_READ', 'OP_WRITE', 'OP_RW', 'OP_INC', 'OP_MAX', 'OP_MIN']

    any_soa = 0
    for nk in range(0, len(kernels)):
        any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

    for nk in range(0, len(kernels)):

        name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
                ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
                unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])

        optidxs = [0] * nargs
        indopts = [-1] * nargs
        nopts = 0
        for i in range(0, nargs):
            if optflags[i] == 1 and maps[i] == OP_ID:
                optidxs[i] = nopts
                nopts = nopts + 1
            elif optflags[i] == 1 and maps[i] == OP_MAP:
                if i == invinds[
                        inds[i] -
                        1]:  #i.e. I am the first occurence of this dat+map combination
                    optidxs[i] = nopts
                    indopts[inds[i] - 1] = i
                    nopts = nopts + 1
                else:
                    optidxs[i] = optidxs[invinds[inds[i] - 1]]

#
# set two logicals
#
        j = -1
        for i in range(0, nargs):
            if maps[i] == OP_MAP and accs[i] == OP_INC:
                j = i
        ind_inc = j >= 0

        j = -1
        for i in range(0, nargs):
            if maps[i] == OP_GBL and accs[i] <> OP_READ and accs[i] <> OP_WRITE:
                j = i
        reduct = j >= 0

        ##########################################################################
        #  start with the user kernel function
        ##########################################################################

        FORTRAN = 0
        CPP = 1
        g_m = 0
        file_text = ''
        depth = 0

        file_name = decl_filepath
        f = open(file_name, 'r')
        kernel_text = f.read()
        f.close()

        comm('user function')

        if CPP:
            includes = op2_gen_common.extract_includes(kernel_text)
            if len(includes) > 0:
                for include in includes:
                    code(include)
                code("")

        #strides for SoA
        if any_soa:
            if nmaps > 0:
                k = []
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
                        k = k + [mapnames[g_m]]
                        code('int opDat' + str(invinds[inds[g_m] - 1]) + '_' +
                             name + '_stride_OP2CONSTANT;')
                        code('int opDat' + str(invinds[inds[g_m] - 1]) + '_' +
                             name + '_stride_OP2HOST=-1;')
            dir_soa = -1
            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID and ((not dims[g_m].isdigit())
                                           or int(dims[g_m]) > 1):
                    code('int direct_' + name + '_stride_OP2CONSTANT;')
                    code('int direct_' + name + '_stride_OP2HOST=-1;')
                    dir_soa = g_m
                    break

        comm('user function')

        kernel_text = op2_gen_common.comment_remover(kernel_text)
        kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text)

        p = re.compile('void\\s+\\b' + name + '\\b')
        i = p.search(kernel_text).start()

        if (i < 0):
            print "\n********"
            print "Error: cannot locate user kernel function name: " + name + " - Aborting code generation"
            exit(2)
        i2 = i

        #i = kernel_text[0:i].rfind('\n') #reverse find
        j = kernel_text[i:].find('{')
        k = op2_gen_common.para_parse(kernel_text, i + j, '{', '}')
        signature_text = kernel_text[i:i + j]
        l = signature_text[0:].find('(')
        head_text = signature_text[0:l].strip()  #save function name
        m = op2_gen_common.para_parse(signature_text, 0, '(', ')')
        signature_text = signature_text[l + 1:m]
        body_text = kernel_text[i + j + 1:k]

        # check for number of arguments
        if len(signature_text.split(',')) != nargs_novec:
            print 'Error parsing user kernel(' + name + '): must have ' + str(
                nargs_novec) + ' arguments'
            return

        for i in range(0, nargs_novec):
            var = signature_text.split(',')[i].strip()
            if kernels[nk]['soaflags'][i]:
                var = var.replace('*', '')
                #locate var in body and replace by adding [idx]
                length = len(re.compile('\\s+\\b').split(var))
                var2 = re.compile('\\s+\\b').split(var)[length - 1].strip()

                if int(kernels[nk]['idxs']
                       [i]) < 0 and kernels[nk]['maps'][i] == OP_MAP:
                    body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+ \
                            op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)
                else:
                    body_text = re.sub('\*\\b' + var2 + '\\b\\s*(?!\[)',
                                       var2 + '[0]', body_text)
                    body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \
                            op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)

        head_text += "_openacc"
        signature_text = '//#pragma acc routine\ninline ' + head_text + '( ' + signature_text + ') {'
        file_text += signature_text + body_text + '}\n'

        ##########################################################################
        # then C++ stub function
        ##########################################################################

        code('')
        comm(' host stub function')
        code('void op_par_loop_' + name + '(char const *name, op_set set,')
        depth += 2

        for m in unique_args:
            g_m = m - 1
            if m == unique_args[len(unique_args) - 1]:
                code('op_arg <ARG>){')
                code('')
            else:
                code('op_arg <ARG>,')

        for g_m in range(0, nargs):
            if maps[g_m] == OP_GBL:  #and accs[g_m] <> OP_READ:
                code('<TYP>*<ARG>h = (<TYP> *)<ARG>.data;')

        code('int nargs = ' + str(nargs) + ';')
        code('op_arg args[' + str(nargs) + '];')
        code('')

        for g_m in range(0, nargs):
            u = [
                i for i in range(0, len(unique_args))
                if unique_args[i] - 1 == g_m
            ]
            if len(u) > 0 and vectorised[g_m] > 0:
                code('<ARG>.idx = 0;')
                code('args[' + str(g_m) + '] = <ARG>;')

                v = [
                    int(vectorised[i] == vectorised[g_m])
                    for i in range(0, len(vectorised))
                ]
                first = [i for i in range(0, len(v)) if v[i] == 1]
                first = first[0]
                if (optflags[g_m] == 1):
                    argtyp = 'op_opt_arg_dat(arg' + str(first) + '.opt, '
                else:
                    argtyp = 'op_arg_dat('

                FOR('v', '1', str(sum(v)))
                code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
                str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');')
                ENDFOR()
                code('')
            elif vectorised[g_m] > 0:
                pass
            else:
                code('args[' + str(g_m) + '] = <ARG>;')

#
# start timing
#
        code('')
        comm(' initialise timers')
        code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
        code('op_timing_realloc(' + str(nk) + ');')
        code('op_timers_core(&cpu_t1, &wall_t1);')
        code('OP_kernels[' + str(nk) + '].name      = name;')
        code('OP_kernels[' + str(nk) + '].count    += 1;')
        code('')

        #
        #   indirect bits
        #
        if ninds > 0:
            code('int  ninds   = ' + str(ninds) + ';')
            line = 'int  inds[' + str(nargs) + '] = {'
            for m in range(0, nargs):
                line += str(inds[m] - 1) + ','
            code(line[:-1] + '};')
            code('')

            IF('OP_diags>2')
            code('printf(" kernel routine with indirection: ' + name +
                 '\\n");')
            ENDIF()

            code('')
            comm(' get plan')
            code('#ifdef OP_PART_SIZE_' + str(nk))
            code('  int part_size = OP_PART_SIZE_' + str(nk) + ';')
            code('#else')
            code('  int part_size = OP_part_size;')
            code('#endif')
            code('')
            code(
                'int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);')

#
# direct bit
#
        else:
            code('')
            IF('OP_diags>2')
            code('printf(" kernel routine w/o indirection:  ' + name + '");')
            ENDIF()
            code('')
            code('op_mpi_halo_exchanges_cuda(set, nargs, args);')

        code('')
        for g_m in range(0, nargs):
            if maps[g_m] == OP_GBL:  #and accs[g_m]<>OP_READ:
                if not dims[g_m].isdigit() or int(dims[g_m]) > 1:
                    print 'ERROR: OpenACC does not support multi-dimensional op_arg_gbl variables'
                    exit(-1)
                code('<TYP> <ARG>_l = <ARG>h[0];')

        if ninds > 0:
            code('')
            code('int ncolors = 0;')
        code('')
        IF('set->size >0')
        code('')
        #managing constants
        if any_soa:
            if nmaps > 0:
                k = []
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
                        k = k + [mapnames[g_m]]
                        IF('(OP_kernels[' + str(nk) + '].count==1) || (opDat' +
                           str(invinds[inds[g_m] - 1]) + '_' + name +
                           '_stride_OP2HOST != getSetSizeFromOpArg(&arg' +
                           str(g_m) + '))')
                        code('opDat' + str(invinds[inds[g_m] - 1]) + '_' +
                             name +
                             '_stride_OP2HOST = getSetSizeFromOpArg(&arg' +
                             str(g_m) + ');')
                        code('opDat' + str(invinds[inds[g_m] - 1]) + '_' +
                             name + '_stride_OP2CONSTANT = opDat' +
                             str(invinds[inds[g_m] - 1]) + '_' + name +
                             '_stride_OP2HOST;')
                        ENDIF()
            if dir_soa <> -1:
                IF('(OP_kernels[' + str(nk) + '].count==1) || (direct_' +
                   name + '_stride_OP2HOST != getSetSizeFromOpArg(&arg' +
                   str(dir_soa) + '))')
                code('direct_' + name +
                     '_stride_OP2HOST = getSetSizeFromOpArg(&arg' +
                     str(dir_soa) + ');')
                code('direct_' + name + '_stride_OP2CONSTANT = direct_' +
                     name + '_stride_OP2HOST;')
                ENDIF()
        code('')
        comm('Set up typed device pointers for OpenACC')
        if nmaps > 0:
            k = []
            for g_m in range(0, nargs):
                if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m] - 1]
                                            in k):
                    k = k + [invmapinds[inds[g_m] - 1]]
                    code('int *map' + str(mapinds[g_m]) + ' = arg' +
                         str(invmapinds[inds[g_m] - 1]) + '.map_data_d;')

        code('')
        for g_m in range(0, nargs):
            if maps[g_m] == OP_ID:
                code(typs[g_m] + '* data' + str(g_m) + ' = (' + typs[g_m] +
                     '*)arg' + str(g_m) + '.data_d;')

        for m in range(1, ninds + 1):
            g_m = invinds[m - 1]
            code('<TYP> *data' + str(g_m) + ' = (<TYP> *)<ARG>.data_d;')

#
# kernel call for indirect version
#
        if ninds > 0:
            code('')
            code(
                'op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);'
            )
            code('ncolors = Plan->ncolors;')
            code('int *col_reord = Plan->col_reord;')
            code('int set_size1 = set->size + set->exec_size;')
            code('')
            comm(' execute plan')
            FOR('col', '0', 'Plan->ncolors')
            IF('col==1')
            code('op_mpi_wait_all_cuda(nargs, args);')
            ENDIF()
            code('int start = Plan->col_offsets[0][col];')
            code('int end = Plan->col_offsets[0][col+1];')
            code('')
            #      code('#pragma omp parallel for')
            line = '#pragma acc parallel loop independent deviceptr(col_reord,'
            if nmaps > 0:
                k = []
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m] - 1]
                                                in k):
                        k = k + [invmapinds[inds[g_m] - 1]]
                        line = line + 'map' + str(mapinds[g_m]) + ','

            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID:
                    line = line + 'data' + str(g_m) + ','
            for m in range(1, ninds + 1):
                g_m = invinds[m - 1]
                line = line + 'data' + str(g_m) + ','
            line = line[:-1] + ')'

            if reduct:
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ and accs[
                            g_m] <> OP_WRITE:
                        if accs[g_m] == OP_INC:
                            line = line + ' reduction(+:arg' + str(g_m) + '_l)'
                        if accs[g_m] == OP_MIN:
                            line = line + ' reduction(min:arg' + str(
                                g_m) + '_l)'
                        if accs[g_m] == OP_MAX:
                            line = line + ' reduction(max:arg' + str(
                                g_m) + '_l)'
            code(line)
            FOR('e', 'start', 'end')
            code('int n = col_reord[e];')
            if nmaps > 0:
                k = []
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
                        k = k + [mapinds[g_m]]
                        code('int map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\
                          '[n + set_size1 * '+str(idxs[g_m])+'];')

            code('')
            for g_m in range(0, nargs):
                u = [
                    i for i in range(0, len(unique_args))
                    if unique_args[i] - 1 == g_m
                ]
                if len(u) > 0 and vectorised[g_m] > 0:
                    if accs[g_m] == OP_READ:
                        line = 'const <TYP>* <ARG>_vec[] = {\n'
                    else:
                        line = '<TYP>* <ARG>_vec[] = {\n'

                    v = [
                        int(vectorised[i] == vectorised[g_m])
                        for i in range(0, len(vectorised))
                    ]
                    first = [i for i in range(0, len(v)) if v[i] == 1]
                    first = first[0]

                    indent = ' ' * (depth + 2)
                    for k in range(0, sum(v)):
                        if soaflags[g_m]:
                            line = line + indent + ' &data' + str(
                                first) + '[map' + str(
                                    mapinds[g_m + k]) + 'idx],\n'
                        else:
                            line = line + indent + ' &data' + str(
                                first) + '[<DIM> * map' + str(
                                    mapinds[g_m + k]) + 'idx],\n'
                    line = line[:-2] + '};'
                    code(line)
            code('')
            line = name + '_openacc('
            indent = '\n' + ' ' * (depth + 2)
            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID:
                    if soaflags[g_m]:
                        line = line + indent + '&data' + str(g_m) + '[n]'
                    else:
                        line = line + indent + '&data' + str(g_m) + '[' + str(
                            dims[g_m]) + ' * n]'
                if maps[g_m] == OP_MAP:
                    if vectorised[g_m]:
                        if g_m + 1 in unique_args:
                            line = line + indent + 'arg' + str(g_m) + '_vec'
                    else:
                        if soaflags[g_m]:
                            line = line + indent + '&data' + str(
                                invinds[inds[g_m] - 1]) + '[map' + str(
                                    mapinds[g_m]) + 'idx]'
                        else:
                            line = line + indent + '&data' + str(
                                invinds[inds[g_m] - 1]) + '[' + str(
                                    dims[g_m]) + ' * map' + str(
                                        mapinds[g_m]) + 'idx]'
                if maps[g_m] == OP_GBL:
                    line = line + indent + '&arg' + str(g_m) + '_l'
                if g_m < nargs - 1:
                    if g_m + 1 in unique_args and not g_m + 1 == unique_args[
                            -1]:
                        line = line + ','
                else:
                    line = line + ');'
            code(line)
            ENDFOR()
            code('')

            if reduct:
                comm(' combine reduction data')
                IF('col == Plan->ncolors_owned-1')
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ:
                        if accs[g_m] == OP_INC or accs[g_m] == OP_WRITE:
                            code('<ARG>h[0] = <ARG>_l;')
                        elif accs[g_m] == OP_MIN:
                            code('<ARG>h[0]  = MIN(<ARG>h[0],<ARG>_l);')
                            ENDFOR()
                        elif accs[g_m] == OP_MAX:
                            code('<ARG>h[0]  = MAX(<ARG>h[0],<ARG>_l);')
                        else:
                            error('internal error: invalid reduction option')
                        ENDFOR()
                ENDIF()
            ENDFOR()

#
# kernel call for direct version
#
        else:
            line = '#pragma acc parallel loop independent deviceptr('
            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID:
                    line = line + 'data' + str(g_m) + ','
            line = line[:-1] + ')'

            if reduct:
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ and accs[
                            g_m] <> OP_WRITE:
                        if accs[g_m] == OP_INC:
                            line = line + ' reduction(+:arg' + str(g_m) + '_l)'
                        if accs[g_m] == OP_MIN:
                            line = line + ' reduction(min:arg' + str(
                                g_m) + '_l)'
                        if accs[g_m] == OP_MAX:
                            line = line + ' reduction(max:arg' + str(
                                g_m) + '_l)'
            code(line)
            FOR('n', '0', 'set->size')
            line = name + '_openacc('
            indent = '\n' + ' ' * (depth + 2)
            for g_m in range(0, nargs):
                if maps[g_m] == OP_ID:
                    if soaflags[g_m]:
                        line = line + indent + '&data' + str(g_m) + '[n]'
                    else:
                        line = line + indent + '&data' + str(g_m) + '[' + str(
                            dims[g_m]) + '*n]'
                if maps[g_m] == OP_GBL:
                    line = line + indent + '&arg' + str(g_m) + '_l'
                if g_m < nargs - 1:
                    line = line + ','
                else:
                    line = line + ');'
            code(line)
            ENDFOR()

        if ninds > 0:
            code('OP_kernels[' + str(nk) + '].transfer  += Plan->transfer;')
            code('OP_kernels[' + str(nk) + '].transfer2 += Plan->transfer2;')

        ENDIF()
        code('')

        #zero set size issues
        if ninds > 0:
            IF('set_size == 0 || set_size == set->core_size || ncolors == 1')
            code('op_mpi_wait_all_cuda(nargs, args);')
            ENDIF()

#
# combine reduction data from multiple OpenMP threads
#
        comm(' combine reduction data')
        for g_m in range(0, nargs):
            if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ:
                if ninds == 0:  #direct version only
                    if accs[g_m] == OP_INC or accs[g_m] == OP_WRITE:
                        code('<ARG>h[0] = <ARG>_l;')
                    elif accs[g_m] == OP_MIN:
                        code('<ARG>h[0]  = MIN(<ARG>h[0],<ARG>_l);')
                    elif accs[g_m] == OP_MAX:
                        code('<ARG>h[0]  = MAX(<ARG>h[0],<ARG>_l);')
                    else:
                        print 'internal error: invalid reduction option'
                if typs[g_m] == 'double':  #need for both direct and indirect
                    code('op_mpi_reduce_double(&<ARG>,<ARG>h);')
                elif typs[g_m] == 'float':
                    code('op_mpi_reduce_float(&<ARG>,<ARG>h);')
                elif typs[g_m] == 'int':
                    code('op_mpi_reduce_int(&<ARG>,<ARG>h);')
                else:
                    print 'Type ' + typs[
                        g_m] + ' not supported in OpenACC code generator, please add it'
                    exit(-1)

        code('op_mpi_set_dirtybit_cuda(nargs, args);')
        code('')

        #
        # update kernel record
        #

        comm(' update kernel record')
        code('op_timers_core(&cpu_t2, &wall_t2);')
        code('OP_kernels[' + str(nk) + '].time     += wall_t2 - wall_t1;')

        if ninds == 0:
            line = 'OP_kernels[' + str(nk) + '].transfer += (float)set->size *'

            for g_m in range(0, nargs):
                if optflags[g_m] == 1:
                    IF('<ARG>.opt')
                if maps[g_m] <> OP_GBL:
                    if accs[g_m] == OP_READ:
                        code(line + ' <ARG>.size;')
                    else:
                        code(line + ' <ARG>.size * 2.0f;')
                if optflags[g_m] == 1:
                    ENDIF()
        depth -= 2
        code('}')

        ##########################################################################
        #  output individual kernel file
        ##########################################################################
        if not os.path.exists('openacc'):
            os.makedirs('openacc')
        fid = open('openacc/' + name + '_acckernel.c', 'w')
        date = datetime.datetime.now()
        fid.write('//\n// auto-generated by op2.py\n//\n\n')
        fid.write(file_text)
        fid.close()


# end of main kernel call loop

##########################################################################
#  output one master kernel file
##########################################################################

    file_text = ''

    comm(' global constants       ')

    for nc in range(0, len(consts)):
        if not consts[nc]['user_declared']:
            if consts[nc]['dim'] == 1:
                code('extern ' + consts[nc]['type'][1:-1] + ' ' +
                     consts[nc]['name'] + ';')
            else:
                if consts[nc]['dim'] > 0:
                    num = str(consts[nc]['dim'])
                else:
                    num = 'MAX_CONST_SIZE'

                code('extern ' + consts[nc]['type'][1:-1] + ' ' +
                     consts[nc]['name'] + '[' + num + '];')
    code('')

    comm(' header                 ')

    if os.path.exists('./user_types.h'):
        code('#include "../user_types.h"')
    code('#include "op_lib_c.h"       ')
    code('')

    code('void op_decl_const_char(int dim, char const *type,')
    code('int size, char *dat, char const *name){}')

    comm(' user kernel files')

    for nk in range(0, len(kernels)):
        code('#include "' + kernels[nk]['name'] + '_acckernel.c"')
    master = master.split('.')[0]
    fid = open('openacc/' + master.split('.')[0] + '_acckernels.c', 'w')
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()
示例#9
0
def op2_gen_openmp4(master, date, consts, kernels):

  global dims, idxs, typs, indtyps, inddims
  global FORTRAN, CPP, g_m, file_text, depth

  OP_ID   = 1;  OP_GBL   = 2;  OP_MAP = 3;

  OP_READ = 1;  OP_WRITE = 2;  OP_RW  = 3;
  OP_INC  = 4;  OP_MAX   = 5;  OP_MIN = 6;

  accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ]
  op2_compiler = os.getenv('OP2_COMPILER','0');
  any_soa = 0
  maptype = 'map'
  for nk in range (0,len(kernels)):
    any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

  for nk in range (0,len(kernels)):

    name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
            ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
            unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])

    optidxs = [0]*nargs
    indopts = [-1]*nargs
    nopts = 0
    for i in range(0,nargs):
      if optflags[i] == 1 and maps[i] == OP_ID:
        optidxs[i] = nopts
        nopts = nopts+1
      elif optflags[i] == 1 and maps[i] == OP_MAP:
        if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination
          optidxs[i] = nopts
          indopts[inds[i]-1] = i
          nopts = nopts+1
        else:
          optidxs[i] = optidxs[invinds[inds[i]-1]]
#
# set two logicals
#
    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_INC:
        j = i
    ind_inc = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_GBL and accs[i] <> OP_READ:
        j = i
    reduct = j >= 0

##########################################################################
#  start with the user kernel function
##########################################################################

    FORTRAN = 0;
    CPP     = 1;
    g_m = 0;
    file_text = ''
    depth = 0

    comm('user function')

    #strides for SoA
    if any_soa:
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;')
            code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;')
      dir_soa = -1
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1):
          code('int direct_'+name+'_stride_OP2CONSTANT;')
          code('int direct_'+name+'_stride_OP2HOST=-1;')
          dir_soa = g_m
          break

    comm('user function')
    file_name = decl_filepath

    f = open(file_name, 'r')
    kernel_text = f.read()
    f.close()

    kernel_text = op2_gen_common.comment_remover(kernel_text)
    kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text)

    p = re.compile('void\\s+\\b'+name+'\\b')
    i = p.search(kernel_text).start()

    if(i < 0):
      print "\n********"
      print "Error: cannot locate user kernel function name: "+name+" - Aborting code generation"
      exit(2)
    i2 = i

    #i = kernel_text[0:i].rfind('\n') #reverse find
    j = kernel_text[i:].find('{')
    k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}')
    signature_text = kernel_text[i:i+j]
    l = signature_text[0:].find('(')
    head_text = signature_text[0:l] #save function name
    m = op2_gen_common.para_parse(signature_text, 0, '(', ')')
    signature_text = signature_text[l+1:m]
    body_text = kernel_text[i+j+1:k]

    ## Replace occurrences of '#include "<FILE>"' within loop with the contents of <FILE>:
    body_text = op2_gen_common.replace_local_includes_with_file_contents(body_text, os.path.dirname(master))

    # check for number of arguments
    if len(signature_text.split(',')) != nargs_novec:
        print 'Error parsing user kernel(%s): must have %d arguments' \
              % name, nargs
        return

    for i in range(0,nargs_novec):
        var = signature_text.split(',')[i].strip()
        if kernels[nk]['soaflags'][i]:
          var = var.replace('*','')
          #locate var in body and replace by adding [idx]
          length = len(re.compile('\\s+\\b').split(var))
          var2 = re.compile('\\s+\\b').split(var)[length-1].strip()

          if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP:
            body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+ \
                op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)
          else:
            body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text)
            body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \
                op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)

    for nc in range(0,len(consts)): 
      varname = consts[nc]['name']
      body_text = re.sub('\\b'+varname+'\\b', varname+'_ompkernel',body_text)
#      if consts[nc]['dim'] == 1:
#        body_text = re.sub(varname+'(?!\w)', varname+'_ompkernel', body_text)
#      else:
#        body_text = re.sub('\*'+varname+'(?!\[)', varname+'[0]', body_text)
#        body_text = re.sub(r''+varname+'\[([A-Za-z0-9]*)\]'+'', varname+r'_ompkernel[\1]', body_text)

    vec = 0
    for n in range(0,nargs):
      if (vectorised[n] == 1):
        vec = 1
    kernel_params = [ var.strip() for var in signature_text.split(',')]
    if vec:
      new_kernel_params = []
      for m in range(0,nargs_novec):
        if int(kernels[nk]['idxs'][m])<0 and int(kernels[nk]['maps'][m]) == OP_MAP:
          new_kernel_params = new_kernel_params + [kernel_params[m]]*int(-1*int(kernels[nk]['idxs'][m]))
        else:
          new_kernel_params = new_kernel_params + [kernel_params[m]]
      kernel_params = new_kernel_params

    # collect constants used by kernel
    kernel_consts = []
    for nc in range(0,len(consts)):
      if body_text.find(consts[nc]['name']+'_ompkernel') != -1:
        kernel_consts.append(nc)

############################################################
#  omp4 function call definition
############################################################
    code('')
    func_call_signaure_text = 'void ' + name + '_omp4_kernel('
    params = ''
    indent = '\n' + '  '
    k = []
    for g_m in range(0, nargs):
      if maps[g_m] == OP_GBL:
        params += indent + rep('<TYP> *<ARG>,',g_m)
      if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k):
        k = k + [invmapinds[inds[g_m]-1]]
        params += indent +  'int *map'+str(mapinds[g_m])+','
        if maptype == 'map':
          params += indent +  'int map'+str(mapinds[g_m])+'size,'
      if maps[g_m] == OP_ID:
        params += indent + rep('<TYP> *data'+str(g_m)+',', g_m)
        if maptype == 'map':
          params += indent +  'int dat'+str(g_m)+'size,'
    for m in range(1,ninds+1):
      g_m = invinds[m-1]
      params += indent + rep('<TYP> *data'+str(g_m)+',', g_m)
      if maptype == 'map':
        params += indent +  'int dat'+str(g_m)+'size,'
    if ninds>0:
      # add indirect kernel specific params to kernel func call 
      params += indent + 'int *col_reord,' + indent + 'int set_size1,' + indent + 'int start,' + indent + 'int end,'
    else:
      # add direct kernel specific params to kernel func call 
      params += indent + 'int count,'
    params += indent + 'int num_teams,' + indent + 'int nthread'
    #add strides for SoA to params
    if any_soa:
      indent = ','+indent
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            params += indent + 'int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT'
      
      if dir_soa<>-1:
        params += indent + 'int direct_'+name+'_stride_OP2CONSTANT'
    if nopts>0:
      params += ', int optflags'
    code(func_call_signaure_text+params+');')

##########################################################################
# then C++ stub function
##########################################################################

    code('')
    comm(' host stub function')
    code('void op_par_loop_'+name+'(char const *name, op_set set,')
    depth += 2

    for m in unique_args:
      g_m = m - 1
      if m == unique_args[len(unique_args)-1]:
        code('op_arg <ARG>){');
        code('')
      else:
        code('op_arg <ARG>,')

    for g_m in range (0,nargs):
      if maps[g_m]==OP_GBL: #and accs[g_m] <> OP_READ:
        code('<TYP>*<ARG>h = (<TYP> *)<ARG>.data;')

    code('int nargs = '+str(nargs)+';')
    code('op_arg args['+str(nargs)+'];')
    code('')

    for g_m in range (0,nargs):
      u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
      if len(u) > 0 and vectorised[g_m] > 0:
        code('<ARG>.idx = 0;')
        code('args['+str(g_m)+'] = <ARG>;')

        v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
        first = [i for i in range(0,len(v)) if v[i] == 1]
        first = first[0]
        if (optflags[g_m] == 1):
          argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, '
        else:
          argtyp = 'op_arg_dat('

        FOR('v','1',str(sum(v)))
        code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
        str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');')
        ENDFOR()
        code('')
      elif vectorised[g_m]>0:
        pass
      else:
        code('args['+str(g_m)+'] = <ARG>;')

    if nopts>0:
      code('int optflags = 0;')
      for i in range(0,nargs):
        if optflags[i] == 1:
          IF('args['+str(i)+'].opt')
          code('optflags |= 1<<'+str(optidxs[i])+';')
          ENDIF()
    if nopts > 30:
      print 'ERROR: too many optional arguments to store flags in an integer'
#
# start timing
#
    code('')
    comm(' initialise timers')
    code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
    code('op_timing_realloc('+str(nk)+');')
    code('op_timers_core(&cpu_t1, &wall_t1);')
    code('OP_kernels[' +str(nk)+ '].name      = name;')
    code('OP_kernels[' +str(nk)+ '].count    += 1;')
    code('')

#
#   indirect bits
#
    if ninds>0:
      code('int  ninds   = '+str(ninds)+';')
      line = 'int  inds['+str(nargs)+'] = {'
      for m in range(0,nargs):
        line += str(inds[m]-1)+','
      code(line[:-1]+'};')
      code('')

      IF('OP_diags>2')
      code('printf(" kernel routine with indirection: '+name+'\\n");')
      ENDIF()
      code('')
      comm(' get plan')
      code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);')

#
# direct bit
#
    else:
      code('')
      IF('OP_diags>2')
      code('printf(" kernel routine w/o indirection:  '+ name + '");')
      ENDIF()
      code('')
      code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);')

#
# get part and block size
#
    code('')
    code('#ifdef OP_PART_SIZE_'+ str(nk))
    code('  int part_size = OP_PART_SIZE_'+str(nk)+';')
    code('#else')
    code('  int part_size = OP_part_size;')
    code('#endif')
    code('#ifdef OP_BLOCK_SIZE_'+ str(nk))
    code('  int nthread = OP_BLOCK_SIZE_'+str(nk)+';')
    code('#else')
    code('  int nthread = OP_block_size;')
    code('#endif')

    code('')
    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL: #and accs[g_m]<>OP_READ:
        if not dims[g_m].isdigit() or int(dims[g_m]) > 1:
          print 'ERROR: OpenMP 4 does not support multi-dimensional variables'
          exit(-1)
        code('<TYP> <ARG>_l = <ARG>h[0];')

    if ninds > 0:
      code('')
      code('int ncolors = 0;')
      code('int set_size1 = set->size + set->exec_size;')
    code('')
    IF('set_size >0')
    #managing constants
    if any_soa:
      code('')
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))')
            code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');')
            code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT = opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST;')
            ENDIF()
      if dir_soa<>-1:
          IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))')
          code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');')
          code('direct_'+name+'_stride_OP2CONSTANT = direct_'+name+'_stride_OP2HOST;')
          ENDIF()
    code('')
    comm('Set up typed device pointers for OpenMP')
    if nmaps > 0:
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k):
          k = k + [invmapinds[inds[g_m]-1]]
          code('int *map'+str(mapinds[g_m])+' = arg'+str(invmapinds[inds[g_m]-1])+'.map_data_d;')
          if maptype == 'map':
            code(' int map'+str(mapinds[g_m])+'size = arg'+str(invmapinds[inds[g_m]-1])+'.map->dim * set_size1;') 

    code('')
    for g_m in range(0,nargs):
      if maps[g_m] == OP_ID:
        code(typs[g_m]+'* data'+str(g_m)+' = ('+typs[g_m]+'*)arg'+str(g_m)+'.data_d;')
        if maptype == 'map':
          if optflags[g_m]:
              code('int dat'+str(g_m)+'size = (arg'+str(g_m)+'.opt?1:0) * getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;')
          else:
            code('int dat'+str(g_m)+'size = getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;')

    for m in range(1,ninds+1):
      g_m = invinds[m-1]
      code('<TYP> *data'+str(g_m)+' = (<TYP> *)<ARG>.data_d;')
      if maptype == 'map':
        if optflags[g_m]:
            code('int dat'+str(g_m)+'size = (arg'+str(g_m)+'.opt?1:0) * getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;')
        else:
          code('int dat'+str(g_m)+'size = getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;')
    
#
# prepare kernel params for indirect version
#
    if ninds>0:
      code('')
      code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);')
      code('ncolors = Plan->ncolors;')
      code('int *col_reord = Plan->col_reord;')
      code('')
      comm(' execute plan')
      FOR('col','0','Plan->ncolors')
      IF('col==1')
      code('op_mpi_wait_all_cuda(nargs, args);')
      ENDIF()
      code('int start = Plan->col_offsets[0][col];')
      code('int end = Plan->col_offsets[0][col+1];')
      code('')
#
# kernel function call
#
    indent = '\n' + ' ' * (depth+2)
    call_params = ','.join([ indent + re.sub(r'\*arg(\d+)',r'&arg\1_l',param.strip().split(' ')[-1]) for param in params.split(',')])
    call_params = call_params.replace('*','')
    # set params for indirect version
    if ninds>0:
      call_params = call_params.replace('num_teams','part_size!=0?(end-start-1)/part_size+1:(end-start-1)/nthread')
    # set params for direct version
    else:
      call_params = re.sub('count','set->size',call_params);
      call_params = call_params.replace('num_teams','part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread') 
    code(func_call_signaure_text.split(' ')[-1]+call_params+');')
    code('')

    if ninds>0:
      if reduct:
        comm(' combine reduction data')
        IF('col == Plan->ncolors_owned-1')
        for g_m in range(0,nargs):
          if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ:
            if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE:
              code('<ARG>h[0] = <ARG>_l;')
            elif accs[g_m]==OP_MIN:
              code('<ARG>h[0]  = MIN(<ARG>h[0],<ARG>_l);')
            elif  accs[g_m]==OP_MAX:
              code('<ARG>h[0]  = MAX(<ARG>h[0],<ARG>_l);')
            else:
              error('internal error: invalid reduction option')
        ENDIF()
      ENDFOR()
      code('OP_kernels['+str(nk)+'].transfer  += Plan->transfer;')
      code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;')

    ENDIF()
    code('')

    #zero set size issues
    if ninds>0:
      IF('set_size == 0 || set_size == set->core_size || ncolors == 1')
      code('op_mpi_wait_all_cuda(nargs, args);')
      ENDIF()

#
# combine reduction data from multiple OpenMP threads
#
    comm(' combine reduction data')
    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ:
        if ninds==0: #direct version only
          if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE:
            code('<ARG>h[0] = <ARG>_l;')
          elif accs[g_m]==OP_MIN:
            code('<ARG>h[0]  = MIN(<ARG>h[0],<ARG>_l);')
          elif accs[g_m]==OP_MAX:
            code('<ARG>h[0]  = MAX(<ARG>h[0],<ARG>_l);')
          else:
            print 'internal error: invalid reduction option'
        if typs[g_m] == 'double': #need for both direct and indirect
          code('op_mpi_reduce_double(&<ARG>,<ARG>h);')
        elif typs[g_m] == 'float':
          code('op_mpi_reduce_float(&<ARG>,<ARG>h);')
        elif typs[g_m] == 'int':
          code('op_mpi_reduce_int(&<ARG>,<ARG>h);')
        else:
          print 'Type '+typs[g_m]+' not supported in OpenMP4 code generator, please add it'
          exit(-1)


    code('op_mpi_set_dirtybit_cuda(nargs, args);')
    code('')

#
# update kernel record
#

    code('if (OP_diags>1) deviceSync();')
    comm(' update kernel record')
    code('op_timers_core(&cpu_t2, &wall_t2);')
    code('OP_kernels[' +str(nk)+ '].time     += wall_t2 - wall_t1;')

    if ninds == 0:
      line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *'

      for g_m in range (0,nargs):
        if optflags[g_m]==1:
          IF('<ARG>.opt')
        if maps[g_m]<>OP_GBL:
          if accs[g_m]==OP_READ:
            code(line+' <ARG>.size;')
          else:
            code(line+' <ARG>.size * 2.0f;')
        if optflags[g_m]==1:
          ENDIF()

    depth -= 2
    code('}')

##########################################################################
#  output individual kernel file
##########################################################################
    if not os.path.exists('openmp4'):
        os.makedirs('openmp4')
    fid = open('openmp4/'+name+'_omp4kernel.cpp','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

##############################################################
# generate ****_omp4kernel_func.cpp
##############################################################
    file_text = ''

    if CPP:
      includes = op2_gen_common.extract_includes(kernel_text)
      if len(includes) > 0:
        for include in includes:
          code(include)
        code("")

    code(func_call_signaure_text+params+'){')
    code('')
    depth += 2
    for g_m in range(0, nargs):
      if maps[g_m] == OP_GBL:
        code('<TYP> <ARG>_l = *<ARG>;')
    line = '#pragma omp target teams'
    if op2_compiler == 'clang':
      line +=' distribute parallel for schedule(static,1)\\\n' + (depth+2)*' '
    line +=' num_teams(num_teams) thread_limit(nthread) '
    map_clause = ''
    if maptype == 'map':
      map_clause = 'map(to:'
    elif maptype == 'is_device_ptr':
      map_clause = 'is_device_ptr('
       
    for g_m in range(0,nargs):
      if maps[g_m] == OP_ID:
        if maptype == 'map':
          map_clause += 'data'+str(g_m)+'[0:dat'+str(g_m)+'size],'
        else:
          map_clause += 'data'+str(g_m)+','
    if map_clause != 'is_device_ptr(' and map_clause != 'map(to:':
      map_clause = map_clause[:-1]+')'
      line += map_clause
    # mapping global consts
    if len(kernel_consts) != 0:
      line += ' \\\n' + (depth+2)*' ' + 'map(to:'
      for nc in kernel_consts:
        line += ' ' + consts[nc]['name']+'_ompkernel,'
        if consts[nc]['dim'] != 1:
          if consts[nc]['dim'] > 0:
            num = str(consts[nc]['dim'])
          else:
            num = 'MAX_CONST_SIZE'
          line = line[:-1] + '[:'+ num +'],'
      line = line[:-1]+')'
    # prepare reduction
    reduction_string = ''
    reduction_mapping = ''
    if reduct:
      reduction_mapping ='\\\n'+(depth+2)*' '+ 'map(tofrom:'
      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ:
          if accs[g_m] == OP_INC:
            reduction_string += ' reduction(+:arg%d_l)' % g_m
            reduction_mapping += ' arg%d_l,' % g_m
          if accs[g_m] == OP_MIN:
            reduction_string += ' reduction(min:arg%d_l)' % g_m
            reduction_mapping += ' arg%d_l,' % g_m
          if accs[g_m] == OP_MAX:
            reduction_string += ' reduction(max:arg%d_l)' % g_m
            reduction_mapping += ' arg%d_l,' % g_m
          if accs[g_m] == OP_WRITE:
            reduction_mapping += ' arg%d_l,' % g_m
      reduction_mapping = reduction_mapping[0:-1]+')' 
#
# map extra pointers for indirect version
#
    if ninds>0:
      if maptype == 'map':
        line += '\\\n'+(depth+2)*' '+'map(to:col_reord[0:set_size1],'
      else:
        line += '\\\n'+(depth+2)*' '+'map(to:col_reord,'
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k):
            k = k + [invmapinds[inds[g_m]-1]]
            if maptype == 'map':
              line = line + 'map'+str(mapinds[g_m])+'[0:map'+str(mapinds[g_m])+'size],'
            else:
              line = line + 'map'+str(mapinds[g_m])+','
      for m in range(1,ninds+1):
        g_m = invinds[m-1]
        if maptype == 'map':
          line = line + 'data'+str(g_m)+'[0:dat'+str(g_m)+'size],'
        else:
          line = line + 'data'+str(g_m)+','
      line = line[:-1]+')'
#
# write omp pragma
#
    code(line + reduction_mapping + reduction_string)
    if op2_compiler != 'clang':
      line = '#pragma omp distribute parallel for schedule(static,1)'
      code(line + reduction_string)
#
# start for loop indirect version
#
    if ninds>0:
      FOR('e','start','end')
      code('int n_op = col_reord[e];')
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx;')
      #do non-optional ones
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k) and (not optflags[g_m]):
            k = k + [mapinds[g_m]]
            code('map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\
              '[n_op + set_size1 * '+str(idxs[g_m])+'];')
      #do optional ones
      if nmaps > 0:
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            if optflags[g_m]:
              IF('optflags & 1<<'+str(optidxs[g_m]))
            else:
              k = k + [mapinds[g_m]]
            code('map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\
              '[n_op + set_size1 * '+str(idxs[g_m])+'];')
            if optflags[g_m]:
              ENDIF()

      code('')
      for g_m in range (0,nargs):
        u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
        if len(u) > 0 and vectorised[g_m] > 0:
          if accs[g_m] == OP_READ:
            line = 'const <TYP>* <ARG>_vec[] = {\n'
          else:
            line = '<TYP>* <ARG>_vec[] = {\n'

          v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
          first = [i for i in range(0,len(v)) if v[i] == 1]
          first = first[0]

          indent = ' '*(depth+2)
          for k in range(0,sum(v)):
            if soaflags[g_m]:
              line = line + indent + ' &data'+str(first)+'[map'+str(mapinds[g_m+k])+'idx],\n'
            else:
              line = line + indent + ' &data'+str(first)+'[<DIM> * map'+str(mapinds[g_m+k])+'idx],\n'
          line = line[:-2]+'};'
          code(line)
#
# direct version
#
    else:
      FOR('n_op','0','count')
#
# write inlined kernel function
#
    comm('variable mapping')
    for g_m in range(0,nargs):
      line = kernel_params[g_m] + ' = '
      if maps[g_m] == OP_ID:
        if soaflags[g_m]:
          line += '&data%d[n_op]' % g_m
        else:
          line += '&data'+str(g_m)+'['+str(dims[g_m])+'*n_op]'
      if maps[g_m] == OP_MAP:
        if vectorised[g_m]:
          if g_m+1 in unique_args:
            line += 'arg'+str(g_m)+'_vec'
          else:
            line = ''
        else:
          if soaflags[g_m]:
            line += '&data'+str(invinds[inds[g_m]-1])+'[map'+str(mapinds[g_m])+'idx]'
          else:
            line += '&data'+str(invinds[inds[g_m]-1])+'['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]'
      if maps[g_m] == OP_GBL:
        line += '&arg%d_l' % g_m
      if len(line):
        line += ';'
        code(line)
    
    code('')
    comm('inline function')
    indent = ' ' * (depth-2)
    inline_body_text = ''
    for line in body_text.split('\n'):
      if len(line):
        inline_body_text += indent+line+'\n'
      else:
        inline_body_text += '\n'
    code(inline_body_text)
    comm('end inline func')

    ENDFOR()
    code('')
    # end kernel function
    for g_m in range(0, nargs):
      if maps[g_m] == OP_GBL:
        code('*<ARG> = <ARG>_l;')
    depth -= 2;
    code('}')


##########################################################################
#  output individual omp4kernel file
##########################################################################
    fid = open('openmp4/'+name+'_omp4kernel_func.cpp','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

# end of main kernel call loop


##########################################################################
#  output one master kernel file
##########################################################################

  file_text =''
  comm(' header                 ')
  code('#include "op_lib_cpp.h"       ')
  code('')
  comm(' user kernel files')

  for nk in range(0,len(kernels)):
    code('#include "'+kernels[nk]['name']+'_omp4kernel.cpp"')
  master = master.split('.')[0]
  fid = open('openmp4/'+master.split('.')[0]+'_omp4kernels.cpp','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()


##########################################################################
#  output omp4 master kernel file
##########################################################################

  file_text =''

  comm(' global constants       ')

  for nc in range (0,len(consts)):
    if consts[nc]['dim']==1:
      code(consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_ompkernel;')
    else:
      if consts[nc]['dim'] > 0:
        num = str(consts[nc]['dim'])
      else:
        num = 'MAX_CONST_SIZE'
      code(consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_ompkernel['+num+'];')
  code('')

  comm(' header                 ')

  if os.path.exists('./user_types.h'):
    code('#include "../user_types.h"')
  code('#include "op_lib_cpp.h"       ')
  code('')

  code('void op_decl_const_char(int dim, char const *type,')
  code('  int size, char *dat, char const *name){')
  indent = ' ' * ( 2+ depth)
  line = '  ' 
  for nc in range (0,len(consts)):
    varname = consts[nc]['name']
    if nc > 0:
        line += ' else '
    line += 'if(!strcmp(name, "%s")) {\n' %varname + indent + 2*' ' + 'memcpy('
    if consts[nc]['dim']==1:
      line += '&'
    line += varname+ '_ompkernel, dat, dim*size);\n' + indent + '#pragma omp target enter data map(to:'+varname+'_ompkernel'
    if consts[nc]['dim'] !=1:
      line += '[:%s]' % str(consts[nc]['dim']) if consts[nc]['dim'] > 0 else 'MAX_CONST_SIZE'
    line += ')\n'+indent + '}'
  code(line)
  code('}')

  comm(' user kernel files')

  for nk in range(0,len(kernels)):
    code('#include "'+kernels[nk]['name']+'_omp4kernel_func.cpp"')
  master = master.split('.')[0]
  fid = open('openmp4/'+master.split('.')[0]+'_omp4kernel_funcs.cpp','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()
示例#10
0
def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):

  global dims, idxs, typs, indtyps, inddims
  global FORTRAN, CPP, g_m, file_text, depth

  OP_ID   = 1;  OP_GBL   = 2;  OP_MAP = 3;

  OP_READ = 1;  OP_WRITE = 2;  OP_RW  = 3;
  OP_INC  = 4;  OP_MAX   = 5;  OP_MIN = 6;

  accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ]

  inc_stage=0
  op_color2=0
  op_color2_force=0
##########################################################################
#  create new kernel file
##########################################################################

  for nk in range (0,len(kernels)):

    name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
           ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
           unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk], inc_stage)

    any_soa = 0
    any_soa = any_soa or sum(soaflags)
#
# set logicals
#
    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_INC:
        j = i
    ind_inc = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_RW:
        j = i
    ind_rw = j >= 0

    if ind_rw or op_color2_force:
        op_color2 = 1
    else:
        op_color2 = 0

    #no staging with 2 level colouring
    if op_color2:
      inc_stage=0

    optidxs = [0]*nargs
    indopts = [-1]*nargs
    nopts = 0
    for i in range(0,nargs):
      if optflags[i] == 1 and maps[i] == OP_ID:
        optidxs[i] = nopts
        nopts = nopts+1
      elif optflags[i] == 1 and maps[i] == OP_MAP:
        if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination
          optidxs[i] = nopts
          indopts[inds[i]-1] = i
          nopts = nopts+1
        else:
          optidxs[i] = optidxs[invinds[inds[i]-1]]

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_GBL and accs[i] <> OP_READ and accs[i] <> OP_WRITE:
        j = i
    reduct = j >= 0

    if inc_stage:
      ninds_staged = 0
      inds_staged = [-1]*nargs
      for i in range(0,nargs):
        if maps[i]==OP_MAP and accs[i]==OP_INC:
          if inds_staged[invinds[inds[i]-1]] == -1:
            inds_staged[i] = ninds_staged
            ninds_staged = ninds_staged + 1
          else:
            inds_staged[i] = inds_staged[invinds[inds[i]-1]]
      invinds_staged = [-1]*ninds_staged
      inddims_staged = [-1]*ninds_staged
      indopts_staged = [-1]*ninds_staged
      for i in range(0,nargs):
        if inds_staged[i] >= 0 and invinds_staged[inds_staged[i]] == -1:
          invinds_staged[inds_staged[i]] = i
          inddims_staged[inds_staged[i]] = dims[i]
          if optflags[i] == 1:
            indopts_staged[inds_staged[i]] = i
      for i in range(0,nargs):
        inds_staged[i] = inds_staged[i] + 1

##########################################################################
#  start with CUDA kernel function
##########################################################################

    FORTRAN = 0;
    CPP     = 1;
    g_m = 0;
    file_text = ''
    depth = 0


    #strides for SoA
    if any_soa:
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            code('__constant__ int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;')
            code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;')
      dir_soa = -1
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1):
          code('__constant__ int direct_'+name+'_stride_OP2CONSTANT;')
          code('int direct_'+name+'_stride_OP2HOST=-1;')
          dir_soa = g_m
          break

    file_name = decl_filepath

    f = open(file_name, 'r')
    kernel_text = f.read()
    f.close()

    if CPP:
      includes = op2_gen_common.extract_includes(kernel_text)
      if len(includes) > 0:
        for include in includes:
          code(include)
        code("")

    comm('user function')
    
    kernel_text = op2_gen_common.comment_remover(kernel_text)
    kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text)

    p = re.compile('void\\s+\\b'+name+'\\b')
    i = p.search(kernel_text).start()

    if(i < 0):
      print "\n********"
      print "Error: cannot locate user kernel function name: "+name+" - Aborting code generation"
      exit(2)
    i2 = i

    #i = kernel_text[0:i].rfind('\n') #reverse find
    j = kernel_text[i:].find('{')
    k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}')
    signature_text = kernel_text[i:i+j]
    l = signature_text[0:].find('(')
    head_text = signature_text[0:l].strip() #save function name
    m = op2_gen_common.para_parse(signature_text, 0, '(', ')')
    signature_text = signature_text[l+1:m]
    body_text = kernel_text[i+j+1:k]

    ## Replace occurrences of '#include "<FILE>"' within loop with the contents of <FILE>:
    body_text = op2_gen_common.replace_local_includes_with_file_contents(body_text, os.path.dirname(master))

    # check for number of arguments
    if len(signature_text.split(',')) != nargs_novec:
        print 'Error parsing user kernel('+name+'): must have '+str(nargs)+' arguments'
        return

    for i in range(0,nargs_novec):
        var = signature_text.split(',')[i].strip()
        if kernels[nk]['soaflags'][i] and (op_color2 or  not (kernels[nk]['maps'][i] == OP_MAP and kernels[nk]['accs'][i] == OP_INC)):
          var = var.replace('*','')
          #locate var in body and replace by adding [idx]
          length = len(re.compile('\\s+\\b').split(var))
          var2 = re.compile('\\s+\\b').split(var)[length-1].strip()

          if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP:
            body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)
          else:
            body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text)
            body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \
                               op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)

    for nc in range(0,len(consts)): 
      varname = consts[nc]['name']
      body_text = re.sub('\\b'+varname+'\\b', varname+'_cuda',body_text)

    signature_text = '__device__ '+head_text + '_gpu( '+signature_text + ') {'
    file_text += signature_text + body_text + '}\n'

    comm('')
    comm(' CUDA kernel function')

    if FORTRAN:
      code('subroutine op_cuda_'+name+'(')
    elif CPP:
      code('__global__ void op_cuda_'+name+'(')

    depth = 2

    if nopts > 0:
      code('int optflags,')
      
    for g_m in range(0,ninds):
      if (indaccs[g_m]==OP_READ):
        code('const <INDTYP> *__restrict <INDARG>,')
      else:
        code('<INDTYP> *__restrict <INDARG>,')

    if nmaps > 0:
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
          k = k + [mapnames[g_m]]
          code('const int *__restrict opDat'+str(invinds[inds[g_m]-1])+'Map, ')



    for g_m in range(0,nargs):
      if maps[g_m] == OP_ID:
        if accs[g_m] == OP_READ:
          code('const <TYP> *__restrict <ARG>,')
        else:
          code('<TYP> *<ARG>,')
      elif maps[g_m] == OP_GBL:
        if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE:
          code('<TYP> *<ARG>,')
        elif accs[g_m] == OP_READ:
          code('const <TYP> *<ARG>,')

    if ind_inc and inc_stage==1:
      code('int   *ind_map,')
      code('short *arg_map,')
      code('int   *ind_arg_sizes,')
      code('int   *ind_arg_offs, ')

    if ninds>0:
      if not op_color2:
        code('int    block_offset, ')
        code('int   *blkmap,       ')
        code('int   *offset,       ')
        code('int   *nelems,       ')
        code('int   *ncolors,      ')
        code('int   *colors,       ')
        code('int   nblocks,       ')
      else:
        code('int start,           ')
        code('int end,             ')
        code('int *col_reord,      ')
      code('int   set_size) {    ')
    else:
      code('int   set_size ) {')
      code('')


    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m] <> OP_WRITE:
        code('<TYP> <ARG>_l[<DIM>];')
        if accs[g_m] == OP_INC:
          FOR('d','0','<DIM>')
          code('<ARG>_l[d]=ZERO_<TYP>;')
          ENDFOR()
        else:
          FOR('d','0','<DIM>')
          code('<ARG>_l[d]=<ARG>[d+blockIdx.x*<DIM>];')
          ENDFOR()
      elif maps[g_m]==OP_MAP and accs[g_m]==OP_INC and not op_color2:
        code('<TYP> <ARG>_l[<DIM>];')

    if not op_color2:
      for m in range (1,ninds+1):
        g_m = m -1
        v = [int(inds[i]==m) for i in range(len(inds))]
        v_i = [vectorised[i] for i in range(len(inds)) if inds[i] == m]
        if sum(v)>1 and sum(v_i)>0: #check this sum(v_i)
          if indaccs[m-1] == OP_INC:
            ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1
            code('<INDTYP> *arg'+str(invinds[m-1])+'_vec['+str(ind)+'] = {'); depth += 2;
            for n in range(0,nargs):
              if inds[n] == m:
                g_m = n
                code('<ARG>_l,')
            depth -= 2
            code('};')
#
# lengthy code for general case with indirection
#
    if ninds>0 and not op_color2:
      code('')
      if inc_stage==1:
        for g_m in range (0,ninds):
          if indaccs[g_m] == OP_INC:
            code('__shared__  int  *<INDARG>_map, <INDARG>_size;')
            code('__shared__  <INDTYP> *<INDARG>_s;')
        code('')
      if ind_inc:
        code('__shared__ int    nelems2, ncolor;')

      code('__shared__ int    nelem, offset_b;')
      code('')
      code('extern __shared__ char shared[];')
      code('')
      IF('blockIdx.x+blockIdx.y*gridDim.x >= nblocks')
      code('return;')
      ENDIF()
      IF('threadIdx.x==0')
      code('')
      comm('get sizes and shift pointers and direct-mapped data')
      code('')
      code('int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x  + block_offset];')
      code('')
      code('nelem    = nelems[blockId];')
      code('offset_b = offset[blockId];')
      code('')

      if ind_inc:
        code('nelems2  = blockDim.x*(1+(nelem-1)/blockDim.x);')
        code('ncolor   = ncolors[blockId];')
        code('')

      if inc_stage==1 and ind_inc:
        for g_m in range (0,ninds_staged):
          if indopts_staged[g_m-1] > 0:
            IF('optflags & 1<<'+str(optidxs[indopts_staged[g_m-1]]))
          code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_size = ind_arg_sizes['+str(g_m)+'+blockId*'+ str(ninds_staged)+'];')
          if indopts_staged[g_m-1] > 0:
            ENDIF()

        code('')
        for m in range (1,ninds_staged+1):
          g_m = m - 1
          c = [i for i in range(nargs) if inds_staged[i]==m]
          code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_map = &ind_map['+str(cumulative_indirect_index[c[0]])+\
          '*set_size] + ind_arg_offs['+str(m-1)+'+blockId*'+str(ninds_staged)+'];')

        code('')
        comm('set shared memory pointers')
        code('int nbytes = 0;')

        for g_m in range(0,ninds_staged):
          code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_s = ('+typs[invinds_staged[g_m]]+' *) &shared[nbytes];')
          if g_m < ninds_staged-1:
            if indopts_staged[g_m-1] > 0:
              IF('optflags & 1<<'+str(optidxs[indopts_staged[g_m-1]]))
            code('nbytes    += ROUND_UP(ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_size*sizeof('+typs[invinds_staged[g_m]]+')*'+dims[invinds_staged[g_m]]+');')
            if indopts_staged[g_m-1] > 0:
              ENDIF()


      ENDIF()
      code('__syncthreads(); // make sure all of above completed')
      code('')

      if inc_stage==1:
        for g_m in range(0,ninds):
          if indaccs[g_m] == OP_INC:
            FOR_INC('n','threadIdx.x','ind_ARG_size*<INDDIM>','blockDim.x')
            code('ind_ARG_s[n] = ZERO_<INDTYP>;')
            ENDFOR()
        if ind_inc:
          code('')
          code('__syncthreads();')
          code('')

      if ind_inc:
        FOR_INC('n','threadIdx.x','nelems2','blockDim.x')
        code('int col2 = -1;')
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx;')
        IF('n<nelem')
        comm('initialise local variables')

        for g_m in range(0,nargs):
          if maps[g_m]==OP_MAP and accs[g_m]==OP_INC:
            FOR('d','0','<DIM>')
            code('<ARG>_l[d] = ZERO_<TYP>;')
            ENDFOR()
      else:
        FOR_INC('n','threadIdx.x','nelem','blockDim.x')
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx;')

      #non-optional maps
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k):
          k = k + [(0*nargs+mapinds[g_m])] #non-opt
          k = k + [(1*nargs+mapinds[g_m])] #opt
          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];')

      #whatever didn't come up and is opt
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in      k)):
          k = k + [(optflags[g_m]*nargs+mapinds[g_m])]
          if optflags[g_m]==1:
            IF('optflags & 1<<'+str(optidxs[g_m]))

          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];')
          if optflags[g_m]==1:
            ENDIF()
 
      code('')
      for g_m in range (0,nargs):
        if accs[g_m] <> OP_INC: #TODO: add opt handling here
          u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
          if len(u) > 0 and vectorised[g_m] > 0:
            if accs[g_m] == OP_READ:
              line = 'const <TYP>* <ARG>_vec[] = {\n'
            else:
              line = '<TYP>* <ARG>_vec[] = {\n'

            v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
            first = [i for i in range(0,len(v)) if v[i] == 1]
            first = first[0]

            indent = ' '*(depth+2)
            for k in range(0,sum(v)):
              if soaflags[g_m]:
                line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[map'+str(mapinds[g_m+k])+'idx],\n'
              else:
                line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[<DIM> * map'+str(mapinds[g_m+k])+'idx],\n'
            line = line[:-2]+'};'
            code(line)
#
# simple version for global coloring
#
    elif ninds>0:
      code('int tid = threadIdx.x + blockIdx.x * blockDim.x;')
      IF('tid + start < end')
      code('int n = col_reord[tid + start];')
      comm('initialise local variables')
      #mapidx declarations
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
          k = k + [mapinds[g_m]]
          code('int map'+str(mapinds[g_m])+'idx;')

      #non-optional maps
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k):
          k = k + [(0*nargs+mapinds[g_m])] #non-opt
          k = k + [(1*nargs+mapinds[g_m])] #opt
          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];')

      #whatever didn't come up and is opt
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in      k)):
          k = k + [(optflags[g_m]*nargs+mapinds[g_m])]
          if optflags[g_m]==1:
            IF('optflags & 1<<'+str(optidxs[g_m]))

          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];')
          if optflags[g_m]==1:
            ENDIF()

      for g_m in range (0,nargs):
          u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
          if len(u) > 0 and vectorised[g_m] > 0:
            if accs[g_m] == OP_READ:
              line = 'const <TYP>* <ARG>_vec[] = {\n'
            else:
              line = '<TYP>* <ARG>_vec[] = {\n'

            v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
            first = [i for i in range(0,len(v)) if v[i] == 1]
            first = first[0]

            indent = ' '*(depth+2)
            for k in range(0,sum(v)):
              if soaflags[g_m]:
                line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[map'+str(mapinds[g_m+k])+'idx],\n'
              else:
                line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[<DIM> * map'+str(mapinds[g_m+k])+'idx],\n'
            line = line[:-2]+'};'
            code(line)




#
# simple alternative when no indirection
#
    else:
      code('')
      comm('process set elements')
      FOR_INC('n','threadIdx.x+blockIdx.x*blockDim.x','set_size','blockDim.x*gridDim.x')

#
# kernel call
#
    code('')
    comm('user-supplied kernel call')
    line = name+'_gpu('
    prefix = ' '*len(name)
    a = 0 #only apply indentation if its not the 0th argument
    indent =''
    for m in range (0, nargs):
      if a > 0:
        indent = '     '+' '*len(name)

      if maps[m] == OP_GBL:
        if accs[m] == OP_READ or accs[m] == OP_WRITE:
          line += rep(indent+'<ARG>,\n',m)
        else:
          line += rep(indent+'<ARG>_l,\n',m);
        a =a+1
      elif maps[m]==OP_MAP and  accs[m]==OP_INC and not op_color2:
        if vectorised[m]:
          if m+1 in unique_args:
            line += rep(indent+'<ARG>_vec,\n',m)
        else:
          line += rep(indent+'<ARG>_l,\n',m)
        a =a+1
      elif maps[m]==OP_MAP:
        if vectorised[m]:
          if m+1 in unique_args:
            line += rep(indent+'<ARG>_vec,\n',m)
        else:
          if soaflags[m]:
            line += rep(indent+'ind_arg'+str(inds[m]-1)+'+map'+str(mapinds[m])+'idx,'+'\n',m)
          else:
            line += rep(indent+'ind_arg'+str(inds[m]-1)+'+map'+str(mapinds[m])+'idx*<DIM>,'+'\n',m)
        a =a+1
      elif maps[m]==OP_ID:
        if ninds>0 and not op_color2:
          if soaflags[m]:
            line += rep(indent+'<ARG>+(n+offset_b),\n',m)
          else:
            line += rep(indent+'<ARG>+(n+offset_b)*<DIM>,\n',m)
          a =a+1
        else:
          if soaflags[m]:
            line += rep(indent+'<ARG>+n,\n',m)
          else:
            line += rep(indent+'<ARG>+n*<DIM>,\n',m)
          a =a+1
      else:
        print 'internal error 1 '

    code(line[0:-2]+');') #remove final ',' and \n

#
# updating for indirect kernels ...
#
    if ninds>0 and not op_color2:
      if ind_inc:
        code('col2 = colors[n+offset_b];')
        ENDIF()
        code('')
        comm('store local variables')
        code('')
        if inc_stage==1:
          for g_m in range(0,nargs):
            if maps[g_m]==OP_MAP and accs[g_m]==OP_INC:
              code('int <ARG>_map;')
          IF('col2>=0')
          for g_m in range(0,nargs):
            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              code('<ARG>_map = arg_map['+str(cumulative_indirect_index[g_m])+'*set_size+n+offset_b];')
          ENDIF()
          code('')

        FOR('col','0','ncolor')
        IF('col2==col')

        if inc_stage==1:
          for g_m in range(0,nargs):
            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              if optflags[g_m]==1:
                IF('optflags & 1<<'+str(optidxs[g_m]))
              for d in range(0,int(dims[g_m])):
                if soaflags[g_m]:
                  code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'_s[<ARG>_map+'+str(d)+'*ind_arg'+str(inds[g_m]-1)+'_size];')
                else:
                  code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'_s['+str(d)+'+<ARG>_map*<DIM>];')
#          for g_m in range(0,nargs):
#            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              for d in range(0,int(dims[g_m])):
                if soaflags[g_m]:
                  code('ind_arg'+str(inds[g_m]-1)+'_s[<ARG>_map+'+str(d)+'*ind_arg'+str(inds[g_m]-1)+'_size] = <ARG>_l['+str(d)+'];')
                else:
                  code('ind_arg'+str(inds[g_m]-1)+'_s['+str(d)+'+<ARG>_map*<DIM>] = <ARG>_l['+str(d)+'];')
                
              if optflags[g_m]==1:
                ENDIF()
        else:
          for g_m in range(0,nargs):
            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              if optflags[g_m]==1:
                IF('optflags & 1<<'+str(optidxs[g_m]))
              for d in range(0,int(dims[g_m])):
                if soaflags[g_m]:
                  code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx];')
                else:
                  code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*<DIM>];')
#          for g_m in range(0,nargs):
#            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              for d in range(0,int(dims[g_m])):
                if soaflags[g_m]:
                  code('ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx] = <ARG>_l['+str(d)+'];')
                else:
                  code('ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*<DIM>] = <ARG>_l['+str(d)+'];')
              if optflags[g_m]==1:
                ENDIF()

        ENDFOR()
        code('__syncthreads();')
        ENDFOR()
    ENDFOR()

    if inc_stage:
      for g_m in range(0,ninds):
        if indaccs[g_m]==OP_INC:
          if indopts[g_m] > 0:
            IF('optflags & 1<<'+str(optidxs[indopts[g_m-1]]))
          if soaflags[invinds[g_m]]:
            FOR_INC('n','threadIdx.x','<INDARG>_size','blockDim.x')
            for d in range(0,int(dims[invinds[g_m]])):
              code('arg'+str(invinds[g_m])+'_l['+str(d)+'] = <INDARG>_s[n+'+str(d)+'*<INDARG>_size] + <INDARG>[<INDARG>_map[n]+'+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'];')
            for d in range(0,int(dims[invinds[g_m]])):
              code('<INDARG>[<INDARG>_map[n]+'+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'] = arg'+str(invinds[g_m])+'_l['+str(d)+'];')
            ENDFOR()
          else:
            FOR_INC('n','threadIdx.x','<INDARG>_size*<INDDIM>','blockDim.x')
            code('<INDARG>[n%<INDDIM>+<INDARG>_map[n/<INDDIM>]*<INDDIM>] += <INDARG>_s[n];')
            ENDFOR()
          if indopts[g_m] > 0:
            ENDIF()

#
# global reduction
#
    if reduct:
       code('')
       comm('global reductions')
       code('')
       for m in range (0,nargs):
         g_m = m
         if maps[m]==OP_GBL and accs[m]<>OP_READ and accs[m] <> OP_WRITE:
           FOR('d','0','<DIM>')
           if accs[m]==OP_INC:
             code('op_reduction<OP_INC>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);')
           elif accs[m]==OP_MIN:
             code('op_reduction<OP_MIN>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);')
           elif accs[m]==OP_MAX:
             code('op_reduction<OP_MAX>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);')
           else:
             print 'internal error: invalid reduction option'
             sys.exit(2);
           ENDFOR()
    depth -= 2
    code('}')
    code('')

##########################################################################
# then C++ stub function
##########################################################################

    code('')
    comm('host stub function')
    code('void op_par_loop_'+name+'(char const *name, op_set set,')
    depth += 2

    for m in unique_args:
      g_m = m - 1
      if m == unique_args[len(unique_args)-1]:
        code('op_arg <ARG>){')
        code('')
      else:
        code('op_arg <ARG>,')

    for g_m in range (0,nargs):
      if maps[g_m]==OP_GBL:
        code('<TYP>*<ARG>h = (<TYP> *)<ARG>.data;')

    code('int nargs = '+str(nargs)+';')
    code('op_arg args['+str(nargs)+'];')
    code('')


    for g_m in range (0,nargs):
      u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
      if len(u) > 0 and vectorised[g_m] > 0:
        code('<ARG>.idx = 0;')
        code('args['+str(g_m)+'] = <ARG>;')

        v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
        first = [i for i in range(0,len(v)) if v[i] == 1]
        first = first[0]
        if (optflags[g_m] == 1):
          argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, '
        else:
          argtyp = 'op_arg_dat('

        FOR('v','1',str(sum(v)))
        code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
        str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');')
        ENDFOR()
        code('')

      elif vectorised[g_m]>0:
        pass
      else:
        code('args['+str(g_m)+'] = <ARG>;')

    if nopts>0:
      code('int optflags = 0;')
      for i in range(0,nargs):
        if optflags[i] == 1:
          IF('args['+str(i)+'].opt')
          code('optflags |= 1<<'+str(optidxs[i])+';')
          ENDIF()
    if nopts > 30:
      print 'ERROR: too many optional arguments to store flags in an integer'
#
# start timing
#
    code('')
    comm(' initialise timers')
    code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
    code('op_timing_realloc('+str(nk)+');')
    code('op_timers_core(&cpu_t1, &wall_t1);')
    code('OP_kernels[' +str(nk)+ '].name      = name;')
    code('OP_kernels[' +str(nk)+ '].count    += 1;')
    code('')

#
#   indirect bits
#
    if ninds>0:
      code('')
      code('int    ninds   = '+str(ninds)+';')
      line = 'int    inds['+str(nargs)+'] = {'
      for m in range(0,nargs):
        line += str(inds[m]-1)+','
      code(line[:-1]+'};')
      code('')

      IF('OP_diags>2')
      code('printf(" kernel routine with indirection: '+name+'\\n");')
      ENDIF()

      code('')
      comm('get plan')
      code('#ifdef OP_PART_SIZE_'+ str(nk))
      code('  int part_size = OP_PART_SIZE_'+str(nk)+';')
      code('#else')
      code('  int part_size = OP_part_size;')
      code('#endif')
      code('')
      code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);')

#
# direct bit
#
    else:
      code('')
      IF('OP_diags>2')
      code('printf(" kernel routine w/o indirection:  '+ name + '");')
      ENDIF()
      code('')
      code('op_mpi_halo_exchanges_cuda(set, nargs, args);')

    IF('set->size > 0')
    code('')

#
# kernel call for indirect version
#
    if ninds>0:
      if inc_stage==1 and ind_inc:
        code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_INC);')
      elif op_color2:
        code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);')
      else:
        code('op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);')
      code('')


#
# transfer constants
#
    g = [i for i in range(0,nargs) if maps[i] == OP_GBL and (accs[i] == OP_READ or accs[i] == OP_WRITE)]
    if len(g)>0:
      comm('transfer constants to GPU')
      code('int consts_bytes = 0;')
      for m in range(0,nargs):
        g_m = m
        if maps[m]==OP_GBL and (accs[m]==OP_READ or accs[m] == OP_WRITE):
          code('consts_bytes += ROUND_UP(<DIM>*sizeof(<TYP>));')

      code('reallocConstArrays(consts_bytes);')
      code('consts_bytes = 0;')

      for m in range(0,nargs):
        if maps[m]==OP_GBL and (accs[m] == OP_READ or accs[m] == OP_WRITE):
          g_m = m
          code('<ARG>.data   = OP_consts_h + consts_bytes;')
          code('<ARG>.data_d = OP_consts_d + consts_bytes;')
          FOR('d','0','<DIM>')
          code('((<TYP> *)<ARG>.data)[d] = <ARG>h[d];')
          ENDFOR()
          code('consts_bytes += ROUND_UP(<DIM>*sizeof(<TYP>));')
      code('mvConstArraysToDevice(consts_bytes);')
      code('')

      #managing constants
    if any_soa:
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))')
            code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');')
            code('cudaMemcpyToSymbol(opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT, &opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST,sizeof(int));')
            ENDIF()
      if dir_soa<>-1:
          IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))')
          code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');')
          code('cudaMemcpyToSymbol(direct_'+name+'_stride_OP2CONSTANT,&direct_'+name+'_stride_OP2HOST,sizeof(int));')
          ENDIF()

#
# transfer global reduction initial data
#

    if ninds == 0:
      comm('set CUDA execution parameters')
      code('#ifdef OP_BLOCK_SIZE_'+str(nk))
      code('  int nthread = OP_BLOCK_SIZE_'+str(nk)+';')
      code('#else')
      code('  int nthread = OP_block_size;')
      comm('  int nthread = 128;')
      code('#endif')
      code('')
      code('int nblocks = 200;')
      code('')

    if reduct:
      comm('transfer global reduction data to GPU')
      if ninds>0:
        code('int maxblocks = 0;')
        FOR('col','0','Plan->ncolors')
        code('maxblocks = MAX(maxblocks,Plan->ncolblk[col]);')
        ENDFOR()
      else:
        code('int maxblocks = nblocks;')

      code('int reduct_bytes = 0;')
      code('int reduct_size  = 0;')

      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE:
          code('reduct_bytes += ROUND_UP(maxblocks*<DIM>*sizeof(<TYP>));')
          code('reduct_size   = MAX(reduct_size,sizeof(<TYP>));')

      code('reallocReductArrays(reduct_bytes);')
      code('reduct_bytes = 0;')

      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE:
          code('<ARG>.data   = OP_reduct_h + reduct_bytes;')
          code('<ARG>.data_d = OP_reduct_d + reduct_bytes;')
          FOR('b','0','maxblocks')
          FOR('d','0','<DIM>')
          if accs[g_m]==OP_INC:
            code('((<TYP> *)<ARG>.data)[d+b*<DIM>] = ZERO_<TYP>;')
          else:
            code('((<TYP> *)<ARG>.data)[d+b*<DIM>] = <ARG>h[d];')
          ENDFOR()
          ENDFOR()
          code('reduct_bytes += ROUND_UP(maxblocks*<DIM>*sizeof(<TYP>));')
      code('mvReductArraysToDevice(reduct_bytes);')
      code('')

#
# kernel call for indirect version
#
    if ninds>0:
      comm('execute plan')
      if not op_color2:
        code('')
        code('int block_offset = 0;')
      FOR('col','0','Plan->ncolors')
      IF('col==Plan->ncolors_core')
      code('op_mpi_wait_all_cuda(nargs, args);')
      ENDIF()
      code('#ifdef OP_BLOCK_SIZE_'+str(nk))
      code('int nthread = OP_BLOCK_SIZE_'+str(nk)+';')
      code('#else')
      code('int nthread = OP_block_size;')
      code('#endif')
      code('')
      if op_color2:
        code('int start = Plan->col_offsets[0][col];')
        code('int end = Plan->col_offsets[0][col+1];')
        code('int nblocks = (end - start - 1)/nthread + 1;')
      else:
        code('dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col],')
        code('Plan->ncolblk[col] >= (1<<16) ? (Plan->ncolblk[col]-1)/65535+1: 1, 1);')
        IF('Plan->ncolblk[col] > 0')

      if reduct or (inc_stage==1 and ind_inc):
        if reduct and inc_stage==1:
          code('int nshared = MAX(Plan->nshared,reduct_size*nthread);')
        elif reduct:
          code('int nshared = reduct_size*nthread;')
        else:
          code('int nshared = Plan->nsharedCol[col];')
        code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(')
      else:
        code('op_cuda_'+name+'<<<nblocks,nthread>>>(')

      if nopts > 0:
        code('optflags,')
      for m in range(1,ninds+1):
        g_m = invinds[m-1]
        code('(<TYP> *)<ARG>.data_d,')
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            code('arg'+str(g_m)+'.map_data_d, ')
      for g_m in range(0,nargs):
        if inds[g_m]==0:
          code('(<TYP>*)<ARG>.data_d,')

      if inc_stage==1 and ind_inc:
        code('Plan->ind_map,')
        code('Plan->loc_map,')
        code('Plan->ind_sizes,')
        code('Plan->ind_offs,')
      if op_color2:
        code('start,')
        code('end,')
        code('Plan->col_reord,')
      else:
        code('block_offset,')
        code('Plan->blkmap,')
        code('Plan->offset,')
        code('Plan->nelems,')
        code('Plan->nthrcol,')
        code('Plan->thrcol,')
        code('Plan->ncolblk[col],')
      code('set->size+set->exec_size);')
      code('')
      if reduct:
        comm('transfer global reduction data back to CPU')
        IF('col == Plan->ncolors_owned-1')
        code('mvReductArraysToHost(reduct_bytes);')
        ENDIF()
      if not op_color2:
        ENDFOR() #TODO sztem ez forditva van...
        code('block_offset += Plan->ncolblk[col];')
      ENDIF()
#
# kernel call for direct version
#
    else:
      if reduct:
        code('int nshared = reduct_size*nthread;')
        code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(')
      else:
        code('op_cuda_'+name+'<<<nblocks,nthread>>>(')

      indent = '  '#*(len(name)+42)
      if nopts > 0:
        code(indent+'optflags,')
      for g_m in range(0,nargs):
        if g_m > 0:
          code(indent+'(<TYP> *) <ARG>.data_d,')
        else:
          code(indent+'(<TYP> *) <ARG>.data_d,')

      code(indent+'set->size );')

    if ninds>0:
      code('OP_kernels['+str(nk)+'].transfer  += Plan->transfer;')
      code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;')


#
# transfer global reduction initial data
#
    if reduct:
      if ninds == 0:
        comm('transfer global reduction data back to CPU')
        code('mvReductArraysToHost(reduct_bytes);')

      for m in range(0,nargs):
        g_m = m
        if maps[m]==OP_GBL and accs[m]<>OP_READ and accs[m] <> OP_WRITE:
          FOR('b','0','maxblocks')
          FOR('d','0','<DIM>')
          if accs[m]==OP_INC:
            code('<ARG>h[d] = <ARG>h[d] + ((<TYP> *)<ARG>.data)[d+b*<DIM>];')
          elif accs[m]==OP_MIN:
            code('<ARG>h[d] = MIN(<ARG>h[d],((<TYP> *)<ARG>.data)[d+b*<DIM>]);')
          elif accs[m]==OP_MAX:
            code('<ARG>h[d] = MAX(<ARG>h[d],((<TYP> *)<ARG>.data)[d+b*<DIM>]);')
          ENDFOR()
          ENDFOR()

          code('<ARG>.data = (char *)<ARG>h;')
          code('op_mpi_reduce(&<ARG>,<ARG>h);')
          
    for g_m in range(0,nargs):
      if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE:
        code('')
        code('mvConstArraysToHost(consts_bytes);')
        break

    for g_m in range(0,nargs):
      if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE:
        FOR('d','0','<DIM>')
        code('<ARG>h[d] = ((<TYP> *)<ARG>.data)[d];') 
        ENDFOR()
        code('<ARG>.data = (char *)<ARG>h;')
        code('op_mpi_reduce(&<ARG>,<ARG>h);')

    ENDIF()
    code('op_mpi_set_dirtybit_cuda(nargs, args);')

#
# update kernel record
#

    code('cutilSafeCall(cudaDeviceSynchronize());')
    comm('update kernel record')
    code('op_timers_core(&cpu_t2, &wall_t2);')
    code('OP_kernels[' +str(nk)+ '].time     += wall_t2 - wall_t1;')

    if ninds == 0:
      line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *'

      for g_m in range (0,nargs):
        if optflags[g_m]==1:
          IF('<ARG>.opt')
        if maps[g_m]<>OP_GBL:
          if accs[g_m]==OP_READ:
            code(line+' <ARG>.size;')
          else:
            code(line+' <ARG>.size * 2.0f;')
        if optflags[g_m]==1:
          ENDIF()
    depth = depth - 2
    code('}')


##########################################################################
#  output individual kernel file
##########################################################################
    if not os.path.exists('cuda'):
        os.makedirs('cuda')
    fid = open('cuda/'+name+'_kernel.cu','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

# end of main kernel call loop


##########################################################################
#  output one master kernel file
##########################################################################

  file_text = ''

  comm('global constants')

  code('#ifndef MAX_CONST_SIZE')
  code('#define MAX_CONST_SIZE 128')
  code('#endif')
  code('')

  for nc in range (0,len(consts)):
    if consts[nc]['dim']==1:
      code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_cuda;')
    else:
      if consts[nc]['dim'] > 0:
        num = str(consts[nc]['dim'])
      else:
        num = 'MAX_CONST_SIZE'

      code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_cuda['+num+'];')
  code('')

  comm('header')

  if os.path.exists('./user_types.h'):
    code('#ifndef OP_FUN_PREFIX\n#define OP_FUN_PREFIX __host__ __device__\n#endif')
    code('#include "../user_types.h"')
  code('#include "op_lib_cpp.h"')
  code('#include "op_cuda_rt_support.h"')
  code('#include "op_cuda_reduction.h"')

  code('')
  code('void op_decl_const_char(int dim, char const *type,')
  code('int size, char *dat, char const *name){')
  depth = depth + 2

  code('if (!OP_hybrid_gpu) return;')
  for nc in range(0,len(consts)):
    IF('!strcmp(name,"'+consts[nc]['name']+'")')
    if consts[nc]['dim'] < 0:
      IF('!strcmp(name,"'+consts[nc]['name']+'") && size>MAX_CONST_SIZE) {')
      code('printf("error: MAX_CONST_SIZE not big enough\n"); exit(1);')
      ENDIF()
    code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+'_cuda, dat, dim*size));')
    ENDIF()
    code('else ')

  code('{')
  depth = depth + 2
  code('printf("error: unknown const name\\n"); exit(1);')
  ENDIF()


  depth = depth - 2
  code('}')
  code('')
  comm('user kernel files')

  for nk in range(0,len(kernels)):
    file_text = file_text +\
    '#include "'+kernels[nk]['name']+'_kernel.cu"\n'

  master = master.split('.')[0]
  fid = open('cuda/'+master.split('.')[0]+'_kernels.cu','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()
示例#11
0
def op2_gen_openmp(master, date, consts, kernels):

    global dims, idxs, typs, indtyps, inddims
    global FORTRAN, CPP, g_m, file_text, depth

    OP_ID = 1
    OP_GBL = 2
    OP_MAP = 3

    OP_READ = 1
    OP_WRITE = 2
    OP_RW = 3
    OP_INC = 4
    OP_MAX = 5
    OP_MIN = 6

    accsstring = ['OP_READ', 'OP_WRITE', 'OP_RW', 'OP_INC', 'OP_MAX', 'OP_MIN']

    any_soa = 0
    for nk in range(0, len(kernels)):
        any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

    for nk in range(0, len(kernels)):

        name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
                ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
                unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])

        optidxs = [0] * nargs
        indopts = [-1] * nargs
        nopts = 0
        for i in range(0, nargs):
            if optflags[i] == 1 and maps[i] == OP_ID:
                optidxs[i] = nopts
                nopts = nopts + 1
            elif optflags[i] == 1 and maps[i] == OP_MAP:
                if i == invinds[
                        inds[i] -
                        1]:  #i.e. I am the first occurence of this dat+map combination
                    optidxs[i] = nopts
                    indopts[inds[i] - 1] = i
                    nopts = nopts + 1
                else:
                    optidxs[i] = optidxs[invinds[inds[i] - 1]]
#
# set two logicals
#
        j = -1
        for i in range(0, nargs):
            if maps[i] == OP_MAP and accs[i] == OP_INC:
                j = i
        ind_inc = j >= 0

        j = -1
        for i in range(0, nargs):
            if maps[i] == OP_GBL and accs[i] <> OP_READ and accs[i] <> OP_WRITE:
                j = i
        reduct = j >= 0
        print name, reduct
        ##########################################################################
        #  start with OpenMP kernel function
        ##########################################################################

        FORTRAN = 0
        CPP = 1
        g_m = 0
        file_text = ''
        depth = 0

        comm('user function')

        if FORTRAN:
            code('include ' + name + '.inc')
        elif CPP:
            code('#include "../' + decl_filepath + '"')

        comm('')
        comm(' x86 kernel function')

        if FORTRAN:
            code('subroutine op_x86_' + name + '(')
        elif CPP:
            code('void op_x86_' + name + '(')

        depth = 2

        if ninds > 0:
            if FORTRAN:
                code('integer(4)  blockIdx,')
            elif CPP:
                code('int    blockIdx,')

        for g_m in range(0, ninds):
            if FORTRAN:
                code('INDTYP *ind_ARG,')
            elif CPP:
                code('INDTYP *ind_ARG,')

        if ninds > 0:
            if FORTRAN:
                code('int   *ind_map,')
                code('short *arg_map,')
            elif CPP:
                code('int   *ind_map,')
                code('short *arg_map,')

        for g_m in range(0, nargs):
            if maps[g_m] == OP_GBL and accs[g_m] == OP_READ:
                # declared const for performance
                if FORTRAN:
                    code('const TYP *ARG,')
                elif CPP:
                    code('const TYP *ARG,')
            elif maps[g_m] == OP_ID and ninds > 0:
                if FORTRAN:
                    code('ARG,')
                elif CPP:
                    code('TYP *ARG,')
            elif maps[g_m] == OP_GBL or maps[g_m] == OP_ID:
                if FORTRAN:
                    code('ARG,')
                elif CPP:
                    code('TYP *ARG,')

        if ninds > 0:
            if FORTRAN:
                code('ind_arg_sizes,')
                code('ind_arg_offs, ')
                code('block_offset, ')
                code('blkmap,       ')
                code('offset,       ')
                code('nelems,       ')
                code('ncolors,      ')
                code('colors,       ')
                code('set_size) {    ')
                code('')
            elif CPP:
                code('int   *ind_arg_sizes,')
                code('int   *ind_arg_offs, ')
                code('int    block_offset, ')
                code('int   *blkmap,       ')
                code('int   *offset,       ')
                code('int   *nelems,       ')
                code('int   *ncolors,      ')
                code('int   *colors,       ')
                code('int   set_size) {    ')
                code('')
        else:
            if FORTRAN:
                code('start, finish )')
            elif CPP:
                code('int  start, int  finish ) {')
            code('')

        for g_m in range(0, nargs):
            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
                code('TYP  ARG_l[DIM];')

        for m in range(1, ninds + 1):
            g_m = m - 1
            v = [int(inds[i] == m) for i in range(len(inds))]
            v_i = [vectorised[i] for i in range(len(inds)) if inds[i] == m]
            if sum(v) > 1 and sum(v_i) > 0:
                if indaccs[m - 1] == OP_INC:
                    ind = int(
                        max([
                            idxs[i] for i in range(len(inds)) if inds[i] == m
                        ])) + 1
                    code('INDTYP *ARG_vec[' + str(ind) + '] = {')
                    depth += 2
                    for n in range(0, nargs):
                        if inds[n] == m:
                            g_m = n
                            code('ARG_l,')
                    depth -= 2
                    code('};')
                else:
                    ind = int(
                        max([
                            idxs[i] for i in range(len(inds)) if inds[i] == m
                        ])) + 1
                    if indaccs[m - 1] == OP_READ:
                        code('const INDTYP *ARG_vec[' + str(ind) + '];')
                    else:
                        code('INDTYP *ARG_vec[' + str(ind) + '];')
#
# lengthy code for general case with indirection
#
        if ninds > 0:
            code('')
            for g_m in range(0, ninds):
                code('int  *ind_ARG_map, ind_ARG_size;')
            for g_m in range(0, ninds):
                code('INDTYP *ind_ARG_s;')

            if FORTRAN:
                code('integer(4) :: nelem, offset_b, blockId')
                code('character :: shared[64000]')
            elif CPP:
                code('int    nelem, offset_b;')
                code('')
                code('char shared[128000];')

            code('')
            IF('0==0')
            code('')
            comm(' get sizes and shift pointers and direct-mapped data')
            code('')
            code('int blockId = blkmap[blockIdx + block_offset];')
            code('nelem    = nelems[blockId];')
            code('offset_b = offset[blockId];')
            code('')

            for g_m in range(0, ninds):
                code('ind_ARG_size = ind_arg_sizes[' + str(g_m) + '+blockId*' +
                     str(ninds) + '];')
            code('')
            for m in range(1, ninds + 1):
                g_m = m - 1
                c = [i for i in range(len(inds)) if inds[i] == m]
                code('ind_ARG_map = &ind_map['+str(cumulative_indirect_index[c[0]])+\
                '*set_size] + ind_arg_offs['+str(m-1)+'+blockId*'+str(ninds)+'];')

            code('')
            comm(' set shared memory pointers')
            code('int nbytes = 0;')

            for g_m in range(0, ninds):
                code('ind_ARG_s = (INDTYP *) &shared[nbytes];')
                if g_m < ninds - 1:
                    code(
                        'nbytes += ROUND_UP(ind_ARG_size*sizeof(INDTYP)*INDDIM);'
                    )
            ENDIF()
            code('')
            comm(
                ' copy indirect datasets into shared memory or zero increment')
            code('')

            for g_m in range(0, ninds):
                if indaccs[g_m] == OP_READ or indaccs[g_m] == OP_RW or indaccs[
                        g_m] == OP_INC:
                    FOR('n', '0', 'INDARG_size')
                    FOR('d', '0', 'INDDIM')
                    if indaccs[g_m] == OP_READ or indaccs[g_m] == OP_RW:
                        code(
                            'INDARG_s[d+n*INDDIM] = INDARG[d+INDARG_map[n]*INDDIM];'
                        )
                        code('')
                    elif indaccs[g_m] == OP_INC:
                        code('INDARG_s[d+n*INDDIM] = ZERO_INDTYP;')
                    ENDFOR()
                    ENDFOR()

            code('')
            comm(' process set elements')
            code('')

            if ind_inc:
                FOR('n', '0', 'nelem')
                comm(' initialise local variables            ')
                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
                        FOR('d', '0', 'DIM')
                        code('ARG_l[d] = ZERO_TYP;')
                        ENDFOR()
            else:
                FOR('n', '0', 'nelem')

#
# simple alternative when no indirection
#
        else:
            comm(' process set elements')
            FOR('n', 'start', 'finish')

#
# kernel call
#
# xxx: array of pointers for non-locals
        for m in range(1, ninds + 1):
            s = [i for i in range(len(inds)) if inds[i] == m]
            if sum(s) > 1:
                if indaccs[m - 1] <> OP_INC:
                    code('')
                    ctr = 0
                    for n in range(0, nargs):
                        if inds[n] == m and vectorised[n]:
                            code('arg'+str(m-1)+'_vec['+str(ctr)+'] = ind_arg'+\
                            str(inds[n]-1)+'_s+arg_map['+str(cumulative_indirect_index[n])+\
                            '*set_size+n+offset_b]*'+str(dims[n])+';')
                            ctr = ctr + 1

        code('')
        comm(' user-supplied kernel call')

        line = name + '('
        prefix = ' ' * len(name)
        a = 0  #only apply indentation if its not the 0th argument
        indent = ''
        for m in range(0, nargs):
            if a > 0:
                indent = '     ' + ' ' * len(name)

            if maps[m] == OP_GBL:
                line += rep(indent + 'ARG,\n', m)
                a = a + 1
            elif maps[m] == OP_MAP and accs[m] == OP_INC and vectorised[m] == 0:
                line += rep(indent + 'ARG_l,\n', m)
                a = a + 1
            elif maps[m] == OP_MAP and vectorised[m] == 0:
                line += rep(indent+'ind_arg'+str(inds[m]-1)+'_s+arg_map['+\
                str(cumulative_indirect_index[m])+'*set_size+n+offset_b]*DIM,\n',m)
                a = a + 1
            elif maps[m] == OP_MAP and m == 0:
                line += rep(indent + 'ARG_vec,' + '\n', inds[m] - 1)
                a = a + 1
            elif maps[m] == OP_MAP and m > 0 and vectorised[m] <> vectorised[
                    m - 1]:  #xxx:vector
                line += rep(indent + 'ARG_vec,' + '\n', inds[m] - 1)
                a = a + 1
            elif maps[m] == OP_MAP and m > 0 and vectorised[m] == vectorised[
                    m - 1]:
                line = line
                a = a + 1
            elif maps[m] == OP_ID:
                if ninds > 0:
                    line += rep(indent + 'ARG+(n+offset_b)*DIM,' + '\n', m)
                    a = a + 1
                else:
                    line += rep(indent + 'ARG+n*DIM,' + '\n', m)
                    a = a + 1
            else:
                print 'internal error 1 '

        code(line[0:-2] + ');')  #remove final ',' and \n

        #
        # updating for indirect kernels ...
        #
        if ninds > 0:
            if ind_inc:
                code('')
                comm(' store local variables            ')

                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
                        code('int ARG_map = arg_map['+ str(cumulative_indirect_index[g_m])+\
                            '*set_size+n+offset_b];')
                code('')

                for g_m in range(0, nargs):
                    if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
                        FOR('d', '0', 'DIM')
                        code('ind_arg' + str(inds[g_m] - 1) +
                             '_s[d+ARG_map*DIM] += ARG_l[d];')
                        ENDFOR()

            ENDFOR()

            s = [i for i in range(1, ninds + 1) if indaccs[i - 1] <> OP_READ]

            if len(s) > 0 and max(s) > 0:
                code('')
                comm(' apply pointered write/increment')

            for g_m in range(0, ninds):
                if indaccs[g_m] == OP_WRITE or indaccs[
                        g_m] == OP_RW or indaccs[g_m] == OP_INC:
                    FOR('n', '0', 'INDARG_size')
                    FOR('d', '0', 'INDDIM')
                    if indaccs[g_m] == OP_WRITE or indaccs[g_m] == OP_RW:
                        code(
                            'INDARG[d+INDARG_map[n]*INDDIM] = INDARG_s[d+n*INDDIM];'
                        )
                    elif indaccs[g_m] == OP_INC:
                        code(
                            'INDARG[d+INDARG_map[n]*INDDIM] += INDARG_s[d+n*INDDIM];'
                        )
                    ENDFOR()
                    ENDFOR()
#
# ... and direct kernels
#
        else:
            depth -= 2
            code('}')

#
# global reduction
#
        depth -= 2
        code('}')
        code('')

        ##########################################################################
        # then C++ stub function
        ##########################################################################

        code('')
        comm(' host stub function          ')
        code('void op_par_loop_' + name + '(char const *name, op_set set,')
        depth += 2

        for m in unique_args:
            g_m = m - 1
            if m == unique_args[len(unique_args) - 1]:
                code('op_arg ARG){')
                code('')
            else:
                code('op_arg ARG,')

        for g_m in range(0, nargs):
            if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ:
                code('TYP*ARGh = (TYP *)ARG.data;')

        code('int nargs = ' + str(nargs) + ';')
        code('op_arg args[' + str(nargs) + '];')
        code('')

        for g_m in range(0, nargs):
            u = [
                i for i in range(0, len(unique_args))
                if unique_args[i] - 1 == g_m
            ]
            if len(u) > 0 and vectorised[g_m] > 0:
                code('ARG.idx = 0;')
                code('args[' + str(g_m) + '] = ARG;')

                v = [
                    int(vectorised[i] == vectorised[g_m])
                    for i in range(0, len(vectorised))
                ]
                first = [i for i in range(0, len(v)) if v[i] == 1]
                first = first[0]
                if (optflags[g_m] == 1):
                    argtyp = 'op_opt_arg_dat(arg' + str(first) + '.opt, '
                else:
                    argtyp = 'op_arg_dat('

                FOR('v', '1', str(sum(v)))
                code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
                str(first)+'.map, DIM, "TYP", '+accsstring[accs[g_m]-1]+');')
                ENDFOR()
                code('')
            elif vectorised[g_m] > 0:
                pass
            else:
                code('args[' + str(g_m) + '] = ARG;')

#
#   indirect bits
#
        if ninds > 0:
            code('int  ninds   = ' + str(ninds) + ';')
            line = 'int  inds[' + str(nargs) + '] = {'
            for m in range(0, nargs):
                line += str(inds[m] - 1) + ','
            code(line[:-1] + '};')
            code('')

            IF('OP_diags>2')
            code('printf(" kernel routine with indirection: ' + name +
                 '\\n");')
            ENDIF()

            code('')
            comm(' get plan')
            code('#ifdef OP_PART_SIZE_' + str(nk))
            code('  int part_size = OP_PART_SIZE_' + str(nk) + ';')
            code('#else')
            code('  int part_size = OP_part_size;')
            code('#endif')
            code('')
            code('int set_size = op_mpi_halo_exchanges(set, nargs, args);')

#
# direct bit
#
        else:
            code('')
            IF('OP_diags>2')
            code('printf(" kernel routine w/o indirection:  ' + name + '");')
            ENDIF()
            code('')
            code('op_mpi_halo_exchanges(set, nargs, args);')

#
# start timing
#
        code('')
        comm(' initialise timers')
        code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
        code('op_timers_core(&cpu_t1, &wall_t1);')
        code('')

        #
        # set number of threads in x86 execution and create arrays for reduction
        #

        if reduct or ninds == 0:
            comm(' set number of threads')
            code('#ifdef _OPENMP')
            code('  int nthreads = omp_get_max_threads();')
            code('#else')
            code('  int nthreads = 1;')
            code('#endif')

        if reduct:
            code('')
            comm(' allocate and initialise arrays for global reduction')
            for g_m in range(0, nargs):
                if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ and accs[
                        g_m] <> OP_WRITE:
                    code('TYP ARG_l[DIM+64*64];')
                    FOR('thr', '0', 'nthreads')
                    if accs[g_m] == OP_INC:
                        FOR('d', '0', 'DIM')
                        code('ARG_l[d+thr*64]=ZERO_TYP;')
                        ENDFOR()
                    else:
                        FOR('d', '0', 'DIM')
                        code('ARG_l[d+thr*64]=ARGh[d];')
                        ENDFOR()
                    ENDFOR()

        code('')
        IF('set->size >0')
        code('')

        #
        # kernel call for indirect version
        #
        if ninds > 0:
            code(
                'op_plan *Plan = op_plan_get_stage_upload(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_ALL,0);'
            )
            code('')
            comm(' execute plan')
            code('int block_offset = 0;')
            FOR('col', '0', 'Plan->ncolors')
            IF('col==Plan->ncolors_core')
            code('op_mpi_wait_all(nargs, args);')
            ENDIF()
            code('int nblocks = Plan->ncolblk[col];')
            code('')
            code('#pragma omp parallel for')
            FOR('blockIdx', '0', 'nblocks')
            code('op_x86_' + name + '( blockIdx,')

            for m in range(1, ninds + 1):
                g_m = invinds[m - 1]
                code('(TYP *)ARG.data,')

            code('Plan->ind_map,')
            code('Plan->loc_map,')

            for m in range(0, nargs):
                g_m = m
                if inds[m] == 0 and maps[m] == OP_GBL and accs[
                        m] <> OP_READ and accs[m] <> OP_WRITE:
                    code('&ARG_l[64*omp_get_thread_num()],')
                elif inds[m] == 0:
                    code('(TYP *)ARG.data,')

            code('Plan->ind_sizes,')
            code('Plan->ind_offs,')
            code('block_offset,')
            code('Plan->blkmap,')
            code('Plan->offset,')
            code('Plan->nelems,')
            code('Plan->nthrcol,')
            code('Plan->thrcol,')
            code('set_size);')
            ENDFOR()
            code('')

            if reduct:
                comm(' combine reduction data')
                IF('col == Plan->ncolors_owned-1')
                for m in range(0, nargs):
                    if maps[m] == OP_GBL and accs[m] <> OP_READ and accs[
                            m] <> OP_WRITE:
                        FOR('thr', '0', 'nthreads')
                        if accs[m] == OP_INC:
                            FOR('d', '0', 'DIM')
                            code('ARGh[d] += ARG_l[d+thr*64];')
                            ENDFOR()
                        elif accs[m] == OP_MIN:
                            FOR('d', '0', 'DIM')
                            code('ARGh[d]  = MIN(ARGh[d],ARG_l[d+thr*64]);')
                            ENDFOR()
                        elif accs(m) == OP_MAX:
                            FOR('d', '0', 'DIM')
                            code('ARGh[d]  = MAX(ARGh[d],ARG_l[d+thr*64]);')
                            ENDFOR()
                        else:
                            error('internal error: invalid reduction option')
                        ENDFOR()
                ENDIF()
            code('block_offset += nblocks;')
            ENDIF()

#
# kernel call for direct version
#
        else:
            comm(' execute plan')
            code('#pragma omp parallel for')
            FOR('thr', '0', 'nthreads')
            code('int start  = (set->size* thr)/nthreads;')
            code('int finish = (set->size*(thr+1))/nthreads;')
            code('op_x86_' + name + '(')

            for g_m in range(0, nargs):
                indent = ''
                if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ and accs[
                        g_m] <> OP_WRITE:
                    code(indent + 'ARG_l + thr*64,')
                else:
                    code(indent + '(TYP *) ARG.data,')

            code('start, finish );')
            ENDFOR()

        if ninds > 0:
            code('op_timing_realloc(' + str(nk) + ');')
            code('OP_kernels[' + str(nk) + '].transfer  += Plan->transfer; ')
            code('OP_kernels[' + str(nk) + '].transfer2 += Plan->transfer2;')

        ENDIF()
        code('')

        #
        # combine reduction data from multiple OpenMP threads, direct version
        #
        comm(' combine reduction data')
        for g_m in range(0, nargs):
            if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ and accs[
                    g_m] <> OP_WRITE and ninds == 0:
                FOR('thr', '0', 'nthreads')
                if accs[g_m] == OP_INC:
                    FOR('d', '0', 'DIM')
                    code('ARGh[d] += ARG_l[d+thr*64];')
                    ENDFOR()
                elif accs[g_m] == OP_MIN:
                    FOR('d', '0', 'DIM')
                    code('ARGh[d]  = MIN(ARGh[d],ARG_l[d+thr*64]);')
                    ENDFOR()
                elif accs[g_m] == OP_MAX:
                    FOR('d', '0', 'DIM')
                    code('ARGh[d]  = MAX(ARGh[d],ARG_l[d+thr*64]);')
                    ENDFOR()
                else:
                    print 'internal error: invalid reduction option'
                ENDFOR()
            if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ:
                code('op_mpi_reduce(&ARG,ARGh);')

        code('op_mpi_set_dirtybit(nargs, args);')
        code('')

        #
        # update kernel record
        #

        comm(' update kernel record')
        code('op_timers_core(&cpu_t2, &wall_t2);')
        code('op_timing_realloc(' + str(nk) + ');')
        code('OP_kernels[' + str(nk) + '].name      = name;')
        code('OP_kernels[' + str(nk) + '].count    += 1;')
        code('OP_kernels[' + str(nk) + '].time     += wall_t2 - wall_t1;')

        if ninds == 0:
            line = 'OP_kernels[' + str(nk) + '].transfer += (float)set->size *'

            for g_m in range(0, nargs):
                if optflags[g_m] == 1:
                    IF('ARG.opt')
                if maps[g_m] <> OP_GBL:
                    if accs[g_m] == OP_READ:
                        code(line + ' ARG.size;')
                    else:
                        code(line + ' ARG.size * 2.0f;')
                if optflags[g_m] == 1:
                    ENDIF()
        depth -= 2
        code('}')

        ##########################################################################
        #  output individual kernel file
        ##########################################################################
        if not os.path.exists('openmp'):
            os.makedirs('openmp')
        fid = open('openmp/' + name + '_kernel.cpp', 'w')
        date = datetime.datetime.now()
        fid.write('//\n// auto-generated by op2.py on ' +
                  date.strftime("%Y-%m-%d %H:%M") + '\n//\n\n')
        fid.write(file_text)
        fid.close()


# end of main kernel call loop

##########################################################################
#  output one master kernel file
##########################################################################

    file_text = ''

    comm(' global constants       ')

    for nc in range(0, len(consts)):
        if consts[nc]['dim'] == 1:
            code('extern ' + consts[nc]['type'][1:-1] + ' ' +
                 consts[nc]['name'] + ';')
        else:
            if consts[nc]['dim'] > 0:
                num = str(consts[nc]['dim'])
            else:
                num = 'MAX_CONST_SIZE'

            code('extern ' + consts[nc]['type'][1:-1] + ' ' +
                 consts[nc]['name'] + '[' + num + '];')
    code('')

    comm(' header                 ')

    if os.path.exists('./user_types.h'):
        code('#include "../user_types.h"')
    code('#include "op_lib_cpp.h"       ')
    code('')

    comm(' user kernel files')

    for nk in range(0, len(kernels)):
        code('#include "' + kernels[nk]['name'] + '_kernel.cpp"')
    master = master.split('.')[0]
    fid = open('openmp/' + master.split('.')[0] + '_kernels.cpp', 'w')
    fid.write('//\n// auto-generated by op2.py on ' +
              date.strftime("%Y-%m-%d %H:%M") + '\n//\n\n')
    fid.write(file_text)
    fid.close()
示例#12
0
def op2_gen_cuda(master, date, consts, kernels, sets):

  global dims, idxs, typs, indtyps, inddims
  global FORTRAN, CPP, g_m, file_text, depth

  OP_ID   = 1;  OP_GBL   = 2;  OP_MAP = 3;

  OP_READ = 1;  OP_WRITE = 2;  OP_RW  = 3;
  OP_INC  = 4;  OP_MAX   = 5;  OP_MIN = 6;

  accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ]

  any_soa = 0
  for nk in range (0,len(kernels)):
    any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

  for nk in range (0,len(kernels)):

    name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
           ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
           unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])

    optidxs = [0]*nargs
    indopts = [-1]*nargs
    nopts = 0
    for i in range(0,nargs):
      if optflags[i] == 1 and maps[i] == OP_ID:
        optidxs[i] = nopts
        nopts = nopts+1
      elif optflags[i] == 1 and maps[i] == OP_MAP:
        if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination
          optidxs[i] = nopts
          indopts[inds[i]-1] = i
          nopts = nopts+1
        else:
          optidxs[i] = optidxs[invinds[inds[i]-1]]
#
# set two logicals
#
    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_INC:
        j = i
    ind_inc = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_RW:
        j = i
    ind_rw= j >= 0
    if (ind_rw):
      print 'Error: indirect OP_RW not supported by op2_gen_cuda code generator'
      sys.exit(2);


    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_GBL and accs[i] <> OP_READ and accs[i] <> OP_WRITE:
        j = i
    reduct = j >= 0

##########################################################################
#  start with CUDA kernel function
##########################################################################

    FORTRAN = 0;
    CPP     = 1;
    g_m = 0;
    file_text = ''
    depth = 0

    comm('user function')

    code('__device__')
    if FORTRAN:
      code('include '+name+'.inc')
    elif CPP:
      code('#include "../'+name+'.h"')

    comm('')
    comm(' CUDA kernel function')

    if FORTRAN:
      code('subroutine op_cuda_'+name+'(')
    elif CPP:
      code('__global__ void op_cuda_'+name+'(')

    depth = 2

    for g_m in range(0,ninds):
      if FORTRAN:
        code('<INDTYP> *ind_<ARG>,')
      elif CPP:
        code('<INDTYP> *ind_<ARG>,')

    if ninds>0:
      if FORTRAN:
        code('int   *ind_map,')
        code('short *arg_map,')
      elif CPP:
        code('int   *ind_map,')
        code('short *arg_map,')

    for g_m in range (0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m] == OP_READ:
        # declared const for performance
        if FORTRAN:
          code('const <TYP> *<ARG>,')
        elif CPP:
          code('const <TYP> *<ARG>,')
      elif maps[g_m]==OP_ID and ninds>0:
        if FORTRAN:
          code('<ARG>,')
        elif CPP:
          code('<TYP>  *<ARG>,')
      elif maps[g_m]==OP_GBL or maps[g_m]==OP_ID:
        if FORTRAN:
          code('<ARG>,')
        elif CPP:
          code('<TYP> *<ARG>,')

    if ninds>0:
      if FORTRAN:
        code('int   *ind_arg_sizes,')
        code('int   *ind_arg_offs, ')
        code('int    block_offset, ')
        code('int   *blkmap,       ')
        code('int   *offset,       ')
        code('int   *nelems,       ')
        code('int   *ncolors,      ')
        code('int   *colors,       ')
        code('int   nblocks,       ')
        code('int   set_size) {    ')
      if CPP:
        code('int   *ind_arg_sizes,')
        code('int   *ind_arg_offs, ')
        code('int    block_offset, ')
        code('int   *blkmap,       ')
        code('int   *offset,       ')
        code('int   *nelems,       ')
        code('int   *ncolors,      ')
        code('int   *colors,       ')
        code('int   nblocks,       ')
        code('int   set_size) {    ')
    else:
      code('int   offset_s,    ')
      code('int   set_size ) {')
      code('')

    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE:
        code('<TYP> <ARG>_l[<DIM>];')
        if accs[g_m] == OP_INC:
          FOR('d','0','<DIM>')
          code('<ARG>_l[d]=ZERO_<TYP>;')
          ENDFOR()
        else:
          FOR('d','0','<DIM>')
          code('<ARG>_l[d]=<ARG>[d+blockIdx.x*<DIM>];')
          ENDFOR()
      elif maps[g_m]==OP_MAP and accs[g_m]==OP_INC:
        code('<TYP> <ARG>_l[<DIM>];')
      elif (ninds==0 and maps[g_m]==OP_ID and dims[g_m]<>'1') and not(soaflags[g_m]):
        code('<TYP> <ARG>_l[<DIM>];')

    for m in range (1,ninds+1):
      g_m = m -1
      v = [int(inds[i]==m) for i in range(len(inds))]
      v_i = [vectorised[i] for i in range(len(inds)) if inds[i] == m]
      if sum(v)>1 and sum(v_i)>0: #check this sum(v_i)
        if indaccs[m-1] == OP_INC:
          ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1
          code('<INDTYP> *<ARG>_vec['+str(ind)+'] = {'); depth += 2;
          for n in range(0,nargs):
            if inds[n] == m:
              g_m = n
              code('<ARG>_l,')
          depth -= 2
          code('};')
        else:
          ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1
          if indaccs[m-1] == OP_READ:
            code('const <INDTYP> *<ARG>_vec['+str(ind)+'];')
          else:
            code('<INDTYP> *<ARG>_vec['+str(ind)+'];')
#
# lengthy code for general case with indirection
#
    if ninds>0:
      code('')
      for g_m in range (0,ninds):
        code('__shared__  int  *ind_<ARG>_map, ind_<ARG>_size;')
      for g_m in range (0,ninds):
        code('__shared__  <INDTYP> *ind_<ARG>_s;')

      if ind_inc:
        code('__shared__ int    nelems2, ncolor;')

      code('__shared__ int    nelem, offset_b;')
      code('')
      code('extern __shared__ char shared[];')
      code('')
      IF('blockIdx.x+blockIdx.y*gridDim.x >= nblocks')
      code('return;')
      ENDIF()
      IF('threadIdx.x==0')
      code('')
      comm('get sizes and shift pointers and direct-mapped data')
      code('')
      code('int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x  + block_offset];')
      code('')
      code('nelem    = nelems[blockId];')
      code('offset_b = offset[blockId];')
      code('')

      if ind_inc:
        code('nelems2  = blockDim.x*(1+(nelem-1)/blockDim.x);')
        code('ncolor   = ncolors[blockId];')
        code('')

      for g_m in range (0,ninds):
        code('ind_<ARG>_size = ind_arg_sizes['+str(g_m)+'+blockId*'+ str(ninds)+'];')

      code('')
      
      for m in range (1,ninds+1):
        g_m = m - 1
        c = [i for i in range(len(inds)) if inds[i]==m]
        code('ind_<ARG>_map = &ind_map['+str(cumulative_indirect_index[c[0]])+\
        '*set_size] + ind_arg_offs['+str(m-1)+'+blockId*'+str(ninds)+'];')

      code('')
      comm('set shared memory pointers')
      code('int nbytes = 0;')

      for g_m in range(0,ninds):
        code('ind_<ARG>_s = (<INDTYP> *) &shared[nbytes];')
        if g_m < ninds-1:
          code('nbytes    += ROUND_UP(ind_<ARG>_size*sizeof(<INDTYP>)*<INDDIM>);')

      ENDIF()
      code('__syncthreads(); // make sure all of above completed')
      code('')
      comm('copy indirect datasets into shared memory or zero increment')
      code('')

      for m in range(0,ninds):
        g_m = m
        if indaccs[m]==OP_READ or indaccs[m]==OP_RW or indaccs[m]==OP_INC:
          FOR_INC('n','threadIdx.x','ind_<ARG>_size*<INDDIM>','blockDim.x')
          if indaccs[m]==OP_READ or indaccs[m]==OP_RW:
            code('ind_arg'+str(m)+'_s[n] = ind_arg'+str(m)+'[n%'+inddims[m]+
            '+ind_arg'+str(m)+'_map[n/'+inddims[m]+']*'+inddims[m]+'];')
            code('')
          elif indaccs[m]==OP_INC:
            code('ind_<ARG>_s[n] = ZERO_<INDTYP>;')
          ENDFOR()

      code('')
      code('__syncthreads();')
      comm('process set elements')
      code('')

      if ind_inc:
        FOR_INC('n','threadIdx.x','nelems2','blockDim.x')
        code('int col2 = -1;')
        IF('n<nelem')
        comm('initialise local variables')

        for g_m in range(0,nargs):
          if maps[g_m]==OP_MAP and accs[g_m]==OP_INC:
            FOR('d','0','<DIM>')
            code('<ARG>_l[d] = ZERO_<TYP>;')
            ENDFOR()
      else:
        FOR_INC('n','threadIdx.x','nelem','blockDim.x')

#
# simple alternative when no indirection
#
    else:
      use_shared = 0;
      for m in range(0,nargs):
        if maps[m]<>OP_GBL and dims[m]<>'1':
          use_shared = 1

      if use_shared:
        code('int   tid = threadIdx.x%OP_WARPSIZE;')
        code('')
        code('extern __shared__ char shared[];')
        code('char *arg_s = shared + offset_s*(threadIdx.x/OP_WARPSIZE);')

      code('')
      comm('process set elements')
      FOR_INC('n','threadIdx.x+blockIdx.x*blockDim.x','set_size','blockDim.x*gridDim.x')

      if use_shared:
        code('int offset = n - tid;')
        code('int nelems = MIN(OP_WARPSIZE,set_size-offset);')
        comm('copy data into shared memory, then into local')

      for m in range(0,nargs):
        g_m = m
        if (maps[m]<>OP_GBL and accs[m]<>OP_WRITE and dims[m]<>'1') and not(soaflags[m]):
          FOR('m','0','<DIM>')
          code('((<TYP> *)arg_s)[tid+m*nelems] = <ARG>[tid+m*nelems+offset*<DIM>];')
          ENDFOR()
          code('')
          FOR('m','0','<DIM>')
          code('<ARG>_l[m] = ((<TYP> *)arg_s)[m+tid*<DIM>];')
          ENDFOR()
          code('')



#
# kernel call
#

    # xxx: array of pointers for non-locals
    for m in range(1,ninds+1):
      s = [i for i in range(len(inds)) if inds[i]==m]
      if sum(s)>1:
        if indaccs[m-1] <> OP_INC:
          code('')
          ctr = 0
          for n in range(0,nargs):
            if inds[n] == m and vectorised[n]:
              code('arg'+str(m-1)+'_vec['+str(ctr)+'] = ind_arg'+\
              str(inds[n]-1)+'_s+arg_map['+str(cumulative_indirect_index[n])+\
              '*set_size+n+offset_b]*'+str(dims[n])+';')
              ctr = ctr+1

    code('')
    comm('user-supplied kernel call')

    line = name+'('
    prefix = ' '*len(name)
    a = 0 #only apply indentation if its not the 0th argument
    indent =''
    for m in range (0, nargs):
      if a > 0:
        indent = '     '+' '*len(name)

      if maps[m] == OP_GBL:
        if accs[m] == OP_READ or accs[m] == OP_WRITE:
          line += rep(indent+'<ARG>,\n',m)
        else:
          line += rep(indent+'<ARG>_l,\n',m);
        a =a+1
      elif maps[m]==OP_MAP and  accs[m]==OP_INC and vectorised[m]==0:
        line += rep(indent+'<ARG>_l,\n',m)
        a =a+1
      elif maps[m]==OP_MAP and vectorised[m]==0:
        line += rep(indent+'ind_arg'+str(inds[m]-1)+'_s+arg_map['+\
        str(cumulative_indirect_index[m])+'*set_size+n+offset_b]*<DIM>,'+'\n',m)
        a =a+1
      elif maps[m]==OP_MAP and m == 0:
        line += rep(indent+'<ARG>_vec,'+'\n',inds[m]-1)
        a =a+1
      elif maps[m]==OP_MAP and m>0 and vectorised[m] <> vectorised[m-1]: #xxx:vector
        line += rep(indent+'<ARG>_vec,'+'\n',inds[m]-1)
        a =a+1
      elif maps[m]==OP_MAP and m>0 and vectorised[m] == vectorised[m-1]:
        line = line
        a =a+1
      elif maps[m]==OP_ID:
        if ninds>0:
          if soaflags[m]:
            line += rep(indent+'<ARG>+(n+offset_b),\n',m)
          else:
            line += rep(indent+'<ARG>+(n+offset_b)*<DIM>,\n',m)
          a =a+1
        else:
          if dims[m] == '1' or soaflags[m]:
            line += rep(indent+'<ARG>+n,\n',m)
          else:
            line += rep(indent+'<ARG>_l,\n',m)
          a =a+1
      else:
        print 'internal error 1 '

    code(line[0:-2]+');') #remove final ',' and \n

#
# updating for indirect kernels ...
#
    if ninds>0:
      if ind_inc:
        code('col2 = colors[n+offset_b];')
        ENDIF()
        code('')
        comm('store local variables')
        code('')

        for g_m in range(0,nargs):
          if maps[g_m]==OP_MAP and accs[g_m]==OP_INC:
            code('int <ARG>_map;')

        IF('col2>=0')

        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
            code('<ARG>_map = arg_map['+str(cumulative_indirect_index[g_m])+'*set_size+n+offset_b];')

        ENDIF()
        code('')
        FOR('col','0','ncolor')
        IF('col2==col')

        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
            FOR('d','0','<DIM>')
            code('ind_arg'+str(inds[g_m]-1)+'_s[d+<ARG>_map*<DIM>] += <ARG>_l[d];')
            ENDFOR()

        ENDFOR()
        code('__syncthreads();')
        ENDFOR()
      ENDFOR()

      s = [i for i in range(1,ninds+1) if indaccs[i-1]<> OP_READ]

      if len(s)>0 and max(s)>0:
        code('')
        comm('apply pointered write/increment')

      for g_m in range(0,ninds):
        if indaccs[g_m]==OP_WRITE or indaccs[g_m]==OP_RW or indaccs[g_m]==OP_INC:
          FOR_INC('n','threadIdx.x','<INDARG>_size*<INDDIM>','blockDim.x')
          if indaccs[g_m]==OP_WRITE or indaccs[g_m]==OP_RW:
            code('<INDARG>[n%<INDDIM>+<INDARG>_map[n/<INDDIM>]*<INDDIM>] = <INDARG>_s[n];')
          elif indaccs[g_m]==OP_INC:
            code('<INDARG>[n%<INDDIM>+<INDARG>_map[n/<INDDIM>]*<INDDIM>] += <INDARG>_s[n];')
          ENDFOR()
#
# ... and direct kernels
#
    else:
      if use_shared:
        comm('copy back into shared memory, then to device')
      for m in range(0,nargs):
        g_m = m
        if (maps[m]<>OP_GBL and accs[m]<>OP_READ and dims[m]<>'1') and not(soaflags[m]):
          code('')
          FOR('m','0','<DIM>')
          code('((<TYP> *)arg_s)[m+tid*<DIM>] = <ARG>_l[m];')
          ENDFOR()
          FOR('m','0','<DIM>')
          code('<ARG>[tid+m*nelems+offset*<DIM>] = ((<TYP> *)arg_s)[tid+m*nelems];')
          ENDFOR()

      depth -= 2
      code('}')

#
# global reduction
#
    if reduct:
       code('')
       comm('global reductions')
       code('')
       for m in range (0,nargs):
         g_m = m
         if maps[m]==OP_GBL and accs[m]<>OP_READ and accs[m] <> OP_WRITE:
           FOR('d','0','<DIM>')
           if accs[m]==OP_INC:
             code('op_reduction<OP_INC>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);')
           elif accs[m]==OP_MIN:
             code('op_reduction<OP_MIN>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);')
           elif accs[m]==OP_MAX:
             code('op_reduction<OP_MAX>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);')
           else:
             print 'internal error: invalid reduction option'
             sys.exit(2);
           ENDFOR()
    depth -= 2
    code('}')
    code('')

##########################################################################
# then C++ stub function
##########################################################################

    code('')
    comm('host stub function')
    code('void op_par_loop_'+name+'(char const *name, op_set set,')
    depth += 2

    for m in unique_args:
      g_m = m - 1
      if m == unique_args[len(unique_args)-1]:
        code('op_arg <ARG>){')
        code('')
      else:
        code('op_arg <ARG>,')

    for g_m in range (0,nargs):
      if maps[g_m]==OP_GBL:
        code('<TYP>*<ARG>h = (<TYP> *)<ARG>.data;')

    code('int nargs = '+str(nargs)+';')
    code('op_arg args['+str(nargs)+'];')
    code('')

    #print vectorised

    for g_m in range (0,nargs):
      u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
      if len(u) > 0 and vectorised[g_m] > 0:
        code('<ARG>.idx = 0;')
        code('args['+str(g_m)+'] = <ARG>;')

        v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
        first = [i for i in range(0,len(v)) if v[i] == 1]
        first = first[0]
        if (optflags[g_m] == 1):
          argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, '
        else:
          argtyp = 'op_arg_dat('

        FOR('v','1',str(sum(v)))
        code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
        str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');')
        ENDFOR()
        code('')

      elif vectorised[g_m]>0:
        pass
      else:
        code('args['+str(g_m)+'] = <ARG>;')

#
#   indirect bits
#
    if ninds>0:
      code('')
      code('int    ninds   = '+str(ninds)+';')
      line = '  int    inds['+str(nargs)+'] = {'
      for m in range(0,nargs):
        line += str(inds[m]-1)+','
      code(line[:-1]+'};')
      code('')

      IF('OP_diags>2')
      code('printf(" kernel routine with indirection: '+name+'\\n");')
      ENDIF()

      code('')
      comm('get plan')
      code('#ifdef OP_PART_SIZE_'+ str(nk))
      code('  int part_size = OP_PART_SIZE_'+str(nk)+';')
      code('#else')
      code('  int part_size = OP_part_size;')
      code('#endif')
      code('')
      code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);')

#
# direct bit
#
    else:
      code('')
      IF('OP_diags>2')
      code('printf(" kernel routine w/o indirection:  '+ name + '");')
      ENDIF()
      code('')
      code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);')
#
# start timing
#
    code('')
    comm(' initialise timers')
    code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
    code('op_timers_core(&cpu_t1, &wall_t1);')
    code('')

    IF('set_size > 0')
    code('')
    code('op_timing_realloc('+str(nk)+');')
    code('OP_kernels[' +str(nk)+ '].name      = name;')
    code('OP_kernels[' +str(nk)+ '].count    += 1;')
    code('')
    if any_soa:
      code('int op2_stride_internal = set->size + set->exec_size + set->nonexec_size;')
      #code('op_decl_const_char(1, "int", sizeof(int), (char *)&op2_stride, "op2_stride");')
      code('cutilSafeCall(cudaMemcpyToSymbol(op2_stride , &op2_stride_internal, sizeof(int)));');
      code('')

#
# kernel call for indirect version
#
    if ninds>0:
      code('op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);')
      code('')


#
# transfer constants
#
    g = [i for i in range(0,nargs) if maps[i] == OP_GBL and (accs[i] == OP_READ or accs[i] == OP_WRITE)]
    if len(g)>0:
      comm('transfer constants to GPU')
      code('int consts_bytes = 0;')
      for m in range(0,nargs):
        g_m = m
        if maps[m]==OP_GBL and (accs[m]==OP_READ or accs[m]==OP_WRITE):
          code('consts_bytes += ROUND_UP(<DIM>*sizeof(<TYP>));')

      code('reallocConstArrays(consts_bytes);')
      code('consts_bytes = 0;')

      for m in range(0,nargs):
        if maps[m]==OP_GBL and (accs[m]==OP_READ  or accs[m]==OP_WRITE):
          g_m = m
          code('<ARG>.data   = OP_consts_h + consts_bytes;')
          code('<ARG>.data_d = OP_consts_d + consts_bytes;')
          FOR('d','0','<DIM>')
          code('((<TYP> *)<ARG>.data)[d] = <ARG>h[d];')
          ENDFOR()
          code('consts_bytes += ROUND_UP(<DIM>*sizeof(<TYP>));')
      code('mvConstArraysToDevice(consts_bytes);')
      code('')


#
# transfer global reduction initial data
#

    if ninds == 0:
      comm('set CUDA execution parameters')
      code('#ifdef OP_BLOCK_SIZE_'+str(nk))
      code('  int nthread = OP_BLOCK_SIZE_'+str(nk)+';')
      code('#else')
      comm('  int nthread = OP_block_size;')
      code('  int nthread = 128;')
      code('#endif')
      code('')
      code('int nblocks = 200;')
      code('')

    if reduct:
      comm('transfer global reduction data to GPU')
      if ninds>0:
        code('int maxblocks = 0;')
        FOR('col','0','Plan->ncolors')
        code('maxblocks = MAX(maxblocks,Plan->ncolblk[col]);')
        ENDFOR()
      else:
        code('int maxblocks = nblocks;')

      code('int reduct_bytes = 0;')
      code('int reduct_size  = 0;')

      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE:
          code('reduct_bytes += ROUND_UP(maxblocks*<DIM>*sizeof(<TYP>));')
          code('reduct_size   = MAX(reduct_size,sizeof(<TYP>));')

      code('reallocReductArrays(reduct_bytes);')
      code('reduct_bytes = 0;')

      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE:
          code('<ARG>.data   = OP_reduct_h + reduct_bytes;')
          code('<ARG>.data_d = OP_reduct_d + reduct_bytes;')
          FOR('b','0','maxblocks')
          FOR('d','0','<DIM>')
          if accs[g_m]==OP_INC:
            code('((<TYP> *)<ARG>.data)[d+b*<DIM>] = ZERO_<TYP>;')
          else:
            code('((<TYP> *)<ARG>.data)[d+b*<DIM>] = <ARG>h[d];')
          ENDFOR()
          ENDFOR()
          code('reduct_bytes += ROUND_UP(maxblocks*<DIM>*sizeof(<TYP>));')
      code('mvReductArraysToDevice(reduct_bytes);')
      code('')

#
# kernel call for indirect version
#
    if ninds>0:
      comm('execute plan')
      code('')
      code('int block_offset = 0;')
      FOR('col','0','Plan->ncolors')
      IF('col==Plan->ncolors_core')
      code('op_mpi_wait_all_cuda(nargs, args);')
      ENDIF()
      code('#ifdef OP_BLOCK_SIZE_'+str(nk))
      code('int nthread = OP_BLOCK_SIZE_'+str(nk)+';')
      code('#else')
      code('int nthread = OP_block_size;')
      code('#endif')
      code('')
      code('dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col],')
      code('Plan->ncolblk[col] >= (1<<16) ? (Plan->ncolblk[col]-1)/65535+1: 1, 1);')
      IF('Plan->ncolblk[col] > 0')

      if reduct:
        code('int nshared = MAX(Plan->nshared,reduct_size*nthread);')
      else:
        code('int nshared = Plan->nsharedCol[col];')

      code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(')

      for m in range(1,ninds+1):
        g_m = invinds[m-1]
        code('(<TYP> *)<ARG>.data_d,')

      code('Plan->ind_map,')
      code('Plan->loc_map,')

      for g_m in range(0,nargs):
        if inds[g_m]==0:
          code('(<TYP>*)<ARG>.data_d,')


      code('Plan->ind_sizes,')
      code('Plan->ind_offs,')
      code('block_offset,')
      code('Plan->blkmap,')
      code('Plan->offset,')
      code('Plan->nelems,')
      code('Plan->nthrcol,')
      code('Plan->thrcol,')
      code('Plan->ncolblk[col],')
      code('set_size);')
      code('')
      if reduct:
        comm('transfer global reduction data back to CPU')
        IF('col == Plan->ncolors_owned-1')
        code('mvReductArraysToHost(reduct_bytes);')
        ENDIF()

      ENDFOR()
      code('block_offset += Plan->ncolblk[col];')
      ENDIF()
#
# kernel call for direct version
#
    else:
      comm('work out shared memory requirements per element')
      code('')
      code('int nshared = 0;')

      for g_m in range(0,nargs):
         if maps[g_m]<>OP_GBL and dims[g_m]<>'1':
           code('nshared = MAX(nshared,sizeof(<TYP>)*<DIM>);')

      code('')
      comm('execute plan')
      code('int offset_s = nshared*OP_WARPSIZE;')
      code('')

      if reduct:
        code('nshared = MAX(nshared*nthread,reduct_size*nthread);')
      else:
        code('nshared = nshared*nthread;')

      code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(')

      indent = '  '#*(len(name)+42)
      for g_m in range(0,nargs):
        if g_m > 0:
          code(indent+'(<TYP> *) <ARG>.data_d,')
        else:
          code(indent+'(<TYP> *) <ARG>.data_d,')

      code(indent+'offset_s,')
      code(indent+'set->size );')

    if ninds>0:
      code('OP_kernels['+str(nk)+'].transfer  += Plan->transfer;')
      code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;')


#
# transfer global reduction initial data
#
    if reduct:
      if ninds == 0:
        comm('transfer global reduction data back to CPU')
        code('mvReductArraysToHost(reduct_bytes);')

      for m in range(0,nargs):
        g_m = m
        if maps[m]==OP_GBL and accs[m]<>OP_READ and accs[m]<>OP_WRITE:
          FOR('b','0','maxblocks')
          FOR('d','0','<DIM>')
          if accs[m]==OP_INC:
            code('<ARG>h[d] = <ARG>h[d] + ((<TYP> *)<ARG>.data)[d+b*<DIM>];')
          elif accs[m]==OP_MIN:
            code('<ARG>h[d] = MIN(<ARG>h[d],((<TYP> *)<ARG>.data)[d+b*<DIM>]);')
          elif accs[m]==OP_MAX:
            code('<ARG>h[d] = MAX(<ARG>h[d],((<TYP> *)<ARG>.data)[d+b*<DIM>]);')
          ENDFOR()
          ENDFOR()

          code('<ARG>.data = (char *)<ARG>h;')
          code('op_mpi_reduce(&<ARG>,<ARG>h);')

    for g_m in range(0,nargs):
      if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE:
        code('mvConstArraysToHost(consts_bytes);')
        break

    for g_m in range(0,nargs):
      if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE:
        FOR('d','0','<DIM>')
        code('<ARG>h[d] = ((<TYP> *)<ARG>.data)[d];') 
        ENDFOR()
        code('<ARG>.data = (char *)<ARG>h;')
        code('op_mpi_reduce(&<ARG>,<ARG>h);')

    ENDIF()
    code('op_mpi_set_dirtybit_cuda(nargs, args);')

#
# update kernel record
#

    comm('update kernel record')
    code('op_timers_core(&cpu_t2, &wall_t2);')
    code('OP_kernels[' +str(nk)+ '].time     += wall_t2 - wall_t1;')

    if ninds == 0:
      line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *'

      for g_m in range (0,nargs):
        if maps[g_m]<>OP_GBL:
          if accs[g_m]==OP_READ or accs[g_m]==OP_WRITE:
            code(line+' <ARG>.size;')
          else:
            code(line+' <ARG>.size * 2.0f;')

    depth = depth - 2
    code('}')


##########################################################################
#  output individual kernel file
##########################################################################
    if not os.path.exists('cuda'):
        os.makedirs('cuda')
    fid = open('cuda/'+name+'_kernel.cu','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n')
    fid.write(file_text)
    fid.close()

# end of main kernel call loop


##########################################################################
#  output one master kernel file
##########################################################################

  file_text = ''

  comm('global constants')

  code('#ifndef MAX_CONST_SIZE')
  code('#define MAX_CONST_SIZE 128')
  code('#endif')
  code('')

  for nc in range (0,len(consts)):
    if consts[nc]['dim']==1:
      code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';')
    else:
      if consts[nc]['dim'] > 0:
        num = str(consts[nc]['dim'])
      else:
        num = 'MAX_CONST_SIZE'

      code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];')
  code('')

  comm('header')
  if os.path.exists('./user_types.h'):
    code('#ifndef OP_FUN_PREFIX\n#define OP_FUN_PREFIX __host__ __device__\n#endif')
    code('#include "../user_types.h"')
  code('#include "op_lib_cpp.h"')
  code('#include "op_cuda_rt_support.h"')
  code('#include "op_cuda_reduction.h"')
  code('')

  # if any_soa:
  #   code('__constant__ int op2_stride;')
  #   code('')
  #   code('#define OP2_STRIDE(arr, idx) arr[op2_stride*(idx)]')

  code('')
  code('void op_decl_const_char(int dim, char const *type,')
  code('int size, char *dat, char const *name){')
  depth = depth + 2

  for nc in range(0,len(consts)):
    IF('!strcmp(name,"'+consts[nc]['name']+'")')
    if consts[nc]['dim'] < 0:
      IF('!strcmp(name,"'+consts[nc]['name']+'") && size>MAX_CONST_SIZE) {')
      code('printf("error: MAX_CONST_SIZE not big enough\n"); exit(1);')
      ENDIF()
    code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+', dat, dim*size));')
    ENDIF()
    code('else ')

  code('{')
  depth = depth + 2
  code('printf("error: unknown const name\\n"); exit(1);')
  ENDIF()


  depth = depth - 2
  code('}')
  code('')
  comm('user kernel files')

  for nk in range(0,len(kernels)):
    file_text = file_text +\
    '#include "'+kernels[nk]['name']+'_kernel.cu"\n'

  master = master.split('.')[0]
  fid = open('cuda/'+master.split('.')[0]+'_kernels.cu','w')
  fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n')
  fid.write(file_text)
  fid.close()
示例#13
0
def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):

  global dims, idxs, typs, indtyps, inddims
  global FORTRAN, CPP, g_m, file_text, depth

  OP_ID   = 1;  OP_GBL   = 2;  OP_MAP = 3;

  OP_READ = 1;  OP_WRITE = 2;  OP_RW  = 3;
  OP_INC  = 4;  OP_MAX   = 5;  OP_MIN = 6;

  accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ]

  inc_stage=0
  op_color2=0
  op_color2_force=0
##########################################################################
#  create new kernel file
##########################################################################

  for nk in range (0,len(kernels)):

    name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
           ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
           unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk], inc_stage)

    any_soa = 0
    any_soa = any_soa or sum(soaflags)
#
# set logicals
#
    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_INC:
        j = i
    ind_inc = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_RW:
        j = i
    ind_rw = j >= 0

    if ind_rw or op_color2_force:
        op_color2 = 1
    else:
        op_color2 = 0

    #no staging with 2 level colouring
    if op_color2:
      inc_stage=0

    optidxs = [0]*nargs
    indopts = [-1]*nargs
    nopts = 0
    for i in range(0,nargs):
      if optflags[i] == 1 and maps[i] == OP_ID:
        optidxs[i] = nopts
        nopts = nopts+1
      elif optflags[i] == 1 and maps[i] == OP_MAP:
        if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination
          optidxs[i] = nopts
          indopts[inds[i]-1] = i
          nopts = nopts+1
        else:
          optidxs[i] = optidxs[invinds[inds[i]-1]]

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_GBL and accs[i] <> OP_READ and accs[i] <> OP_WRITE:
        j = i
    reduct = j >= 0

    if inc_stage:
      ninds_staged = 0
      inds_staged = [-1]*nargs
      for i in range(0,nargs):
        if maps[i]==OP_MAP and accs[i]==OP_INC:
          if inds_staged[invinds[inds[i]-1]] == -1:
            inds_staged[i] = ninds_staged
            ninds_staged = ninds_staged + 1
          else:
            inds_staged[i] = inds_staged[invinds[inds[i]-1]]
      invinds_staged = [-1]*ninds_staged
      inddims_staged = [-1]*ninds_staged
      indopts_staged = [-1]*ninds_staged
      for i in range(0,nargs):
        if inds_staged[i] >= 0 and invinds_staged[inds_staged[i]] == -1:
          invinds_staged[inds_staged[i]] = i
          inddims_staged[inds_staged[i]] = dims[i]
          if optflags[i] == 1:
            indopts_staged[inds_staged[i]] = i
      for i in range(0,nargs):
        inds_staged[i] = inds_staged[i] + 1

##########################################################################
#  start with CUDA kernel function
##########################################################################

    FORTRAN = 0;
    CPP     = 1;
    g_m = 0;
    file_text = ''
    depth = 0


    #strides for SoA
    if any_soa:
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            code('__constant__ int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;')
            code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;')
      dir_soa = -1
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1):
          code('__constant__ int direct_'+name+'_stride_OP2CONSTANT;')
          code('int direct_'+name+'_stride_OP2HOST=-1;')
          dir_soa = g_m
          break

    file_name = decl_filepath

    f = open(file_name, 'r')
    kernel_text = f.read()
    f.close()

    if CPP:
      includes = op2_gen_common.extract_includes(kernel_text)
      if len(includes) > 0:
        for include in includes:
          code(include)
        code("")

    comm('user function')
    
    kernel_text = op2_gen_common.comment_remover(kernel_text)
    kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text)

    p = re.compile('void\\s+\\b'+name+'\\b')
    i = p.search(kernel_text).start()

    if(i < 0):
      print "\n********"
      print "Error: cannot locate user kernel function name: "+name+" - Aborting code generation"
      exit(2)
    i2 = i

    #i = kernel_text[0:i].rfind('\n') #reverse find
    j = kernel_text[i:].find('{')
    k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}')
    signature_text = kernel_text[i:i+j]
    l = signature_text[0:].find('(')
    head_text = signature_text[0:l].strip() #save function name
    m = op2_gen_common.para_parse(signature_text, 0, '(', ')')
    signature_text = signature_text[l+1:m]
    body_text = kernel_text[i+j+1:k]

    # check for number of arguments
    if len(signature_text.split(',')) != nargs_novec:
        print 'Error parsing user kernel('+name+'): must have '+str(nargs)+' arguments'
        return

    for i in range(0,nargs_novec):
        var = signature_text.split(',')[i].strip()
        if kernels[nk]['soaflags'][i] and (op_color2 or  not (kernels[nk]['maps'][i] == OP_MAP and kernels[nk]['accs'][i] == OP_INC)):
          var = var.replace('*','')
          #locate var in body and replace by adding [idx]
          length = len(re.compile('\\s+\\b').split(var))
          var2 = re.compile('\\s+\\b').split(var)[length-1].strip()

          if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP:
            body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)
          else:
            body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text)
            body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \
                               op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text)

    signature_text = '__device__ '+head_text + '_gpu( '+signature_text + ') {'
    file_text += signature_text + body_text + '}\n'

    comm('')
    comm(' CUDA kernel function')

    if FORTRAN:
      code('subroutine op_cuda_'+name+'(')
    elif CPP:
      code('__global__ void op_cuda_'+name+'(')

    depth = 2

    if nopts > 0:
      code('int optflags,')
      
    for g_m in range(0,ninds):
      if (indaccs[g_m]==OP_READ):
        code('const INDTYP *__restrict INDARG,')
      else:
        code('INDTYP *__restrict INDARG,')

    if nmaps > 0:
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
          k = k + [mapnames[g_m]]
          code('const int *__restrict opDat'+str(invinds[inds[g_m]-1])+'Map, ')



    for g_m in range(0,nargs):
      if maps[g_m] == OP_ID:
        if accs[g_m] == OP_READ:
          code('const TYP *__restrict ARG,')
        else:
          code('TYP *ARG,')
      elif maps[g_m] == OP_GBL:
        if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE:
          code('TYP *ARG,')
        elif accs[g_m] == OP_READ:
          code('const TYP *ARG,')

    if ind_inc and inc_stage==1:
      code('int   *ind_map,')
      code('short *arg_map,')
      code('int   *ind_arg_sizes,')
      code('int   *ind_arg_offs, ')

    if ninds>0:
      if not op_color2:
        code('int    block_offset, ')
        code('int   *blkmap,       ')
        code('int   *offset,       ')
        code('int   *nelems,       ')
        code('int   *ncolors,      ')
        code('int   *colors,       ')
        code('int   nblocks,       ')
      else:
        code('int start,           ')
        code('int end,             ')
        code('int *col_reord,      ')
      code('int   set_size) {    ')
    else:
      code('int   set_size ) {')
      code('')


    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m] <> OP_WRITE:
        code('TYP ARG_l[DIM];')
        if accs[g_m] == OP_INC:
          FOR('d','0','DIM')
          code('ARG_l[d]=ZERO_TYP;')
          ENDFOR()
        else:
          FOR('d','0','DIM')
          code('ARG_l[d]=ARG[d+blockIdx.x*DIM];')
          ENDFOR()
      elif maps[g_m]==OP_MAP and accs[g_m]==OP_INC and not op_color2:
        code('TYP ARG_l[DIM];')

    if not op_color2:
      for m in range (1,ninds+1):
        g_m = m -1
        v = [int(inds[i]==m) for i in range(len(inds))]
        v_i = [vectorised[i] for i in range(len(inds)) if inds[i] == m]
        if sum(v)>1 and sum(v_i)>0: #check this sum(v_i)
          if indaccs[m-1] == OP_INC:
            ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1
            code('INDTYP *arg'+str(invinds[m-1])+'_vec['+str(ind)+'] = {'); depth += 2;
            for n in range(0,nargs):
              if inds[n] == m:
                g_m = n
                code('ARG_l,')
            depth -= 2
            code('};')
#
# lengthy code for general case with indirection
#
    if ninds>0 and not op_color2:
      code('')
      if inc_stage==1:
        for g_m in range (0,ninds):
          if indaccs[g_m] == OP_INC:
            code('__shared__  int  *INDARG_map, INDARG_size;')
            code('__shared__  INDTYP *INDARG_s;')
        code('')
      if ind_inc:
        code('__shared__ int    nelems2, ncolor;')

      code('__shared__ int    nelem, offset_b;')
      code('')
      code('extern __shared__ char shared[];')
      code('')
      IF('blockIdx.x+blockIdx.y*gridDim.x >= nblocks')
      code('return;')
      ENDIF()
      IF('threadIdx.x==0')
      code('')
      comm('get sizes and shift pointers and direct-mapped data')
      code('')
      code('int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x  + block_offset];')
      code('')
      code('nelem    = nelems[blockId];')
      code('offset_b = offset[blockId];')
      code('')

      if ind_inc:
        code('nelems2  = blockDim.x*(1+(nelem-1)/blockDim.x);')
        code('ncolor   = ncolors[blockId];')
        code('')

      if inc_stage==1 and ind_inc:
        for g_m in range (0,ninds_staged):
          if indopts_staged[g_m-1] > 0:
            IF('optflags & 1<<'+str(optidxs[indopts_staged[g_m-1]]))
          code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_size = ind_arg_sizes['+str(g_m)+'+blockId*'+ str(ninds_staged)+'];')
          if indopts_staged[g_m-1] > 0:
            ENDIF()

        code('')
        for m in range (1,ninds_staged+1):
          g_m = m - 1
          c = [i for i in range(nargs) if inds_staged[i]==m]
          code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_map = &ind_map['+str(cumulative_indirect_index[c[0]])+\
          '*set_size] + ind_arg_offs['+str(m-1)+'+blockId*'+str(ninds_staged)+'];')

        code('')
        comm('set shared memory pointers')
        code('int nbytes = 0;')

        for g_m in range(0,ninds_staged):
          code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_s = ('+typs[invinds_staged[g_m]]+' *) &shared[nbytes];')
          if g_m < ninds_staged-1:
            if indopts_staged[g_m-1] > 0:
              IF('optflags & 1<<'+str(optidxs[indopts_staged[g_m-1]]))
            code('nbytes    += ROUND_UP(ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_size*sizeof('+typs[invinds_staged[g_m]]+')*'+dims[invinds_staged[g_m]]+');')
            if indopts_staged[g_m-1] > 0:
              ENDIF()


      ENDIF()
      code('__syncthreads(); // make sure all of above completed')
      code('')

      if inc_stage==1:
        for g_m in range(0,ninds):
          if indaccs[g_m] == OP_INC:
            FOR_INC('n','threadIdx.x','ind_ARG_size*INDDIM','blockDim.x')
            code('ind_ARG_s[n] = ZERO_INDTYP;')
            ENDFOR()
        if ind_inc:
          code('')
          code('__syncthreads();')
          code('')

      if ind_inc:
        FOR_INC('n','threadIdx.x','nelems2','blockDim.x')
        code('int col2 = -1;')
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx;')
        IF('n<nelem')
        comm('initialise local variables')

        for g_m in range(0,nargs):
          if maps[g_m]==OP_MAP and accs[g_m]==OP_INC:
            FOR('d','0','DIM')
            code('ARG_l[d] = ZERO_TYP;')
            ENDFOR()
      else:
        FOR_INC('n','threadIdx.x','nelem','blockDim.x')
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx;')

      #non-optional maps
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k):
          k = k + [(0*nargs+mapinds[g_m])] #non-opt
          k = k + [(1*nargs+mapinds[g_m])] #opt
          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];')

      #whatever didn't come up and is opt
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in      k)):
          k = k + [(optflags[g_m]*nargs+mapinds[g_m])]
          if optflags[g_m]==1:
            IF('optflags & 1<<'+str(optidxs[g_m]))

          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];')
          if optflags[g_m]==1:
            ENDIF()
 
      code('')
      for g_m in range (0,nargs):
        if accs[g_m] <> OP_INC: #TODO: add opt handling here
          u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
          if len(u) > 0 and vectorised[g_m] > 0:
            if accs[g_m] == OP_READ:
              line = 'const TYP* ARG_vec[] = {\n'
            else:
              line = 'TYP* ARG_vec[] = {\n'

            v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
            first = [i for i in range(0,len(v)) if v[i] == 1]
            first = first[0]

            indent = ' '*(depth+2)
            for k in range(0,sum(v)):
              if soaflags[g_m]:
                line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[map'+str(mapinds[g_m+k])+'idx],\n'
              else:
                line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[DIM * map'+str(mapinds[g_m+k])+'idx],\n'
            line = line[:-2]+'};'
            code(line)
#
# simple version for global coloring
#
    elif ninds>0:
      code('int tid = threadIdx.x + blockIdx.x * blockDim.x;')
      code('if(tid + start >= end) return;')
      code('int n = col_reord[tid + start];')
      comm('initialise local variables')
      #mapidx declarations
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
          k = k + [mapinds[g_m]]
          code('int map'+str(mapinds[g_m])+'idx;')

      #non-optional maps
      k = []
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k):
          k = k + [(0*nargs+mapinds[g_m])] #non-opt
          k = k + [(1*nargs+mapinds[g_m])] #opt
          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];')

      #whatever didn't come up and is opt
      for g_m in range(0,nargs):
        if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in      k)):
          k = k + [(optflags[g_m]*nargs+mapinds[g_m])]
          if optflags[g_m]==1:
            IF('optflags & 1<<'+str(optidxs[g_m]))

          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];')
          if optflags[g_m]==1:
            ENDIF()

      for g_m in range (0,nargs):
          u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
          if len(u) > 0 and vectorised[g_m] > 0:
            if accs[g_m] == OP_READ:
              line = 'const TYP* ARG_vec[] = {\n'
            else:
              line = 'TYP* ARG_vec[] = {\n'

            v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
            first = [i for i in range(0,len(v)) if v[i] == 1]
            first = first[0]

            indent = ' '*(depth+2)
            for k in range(0,sum(v)):
              if soaflags[g_m]:
                line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[map'+str(mapinds[g_m+k])+'idx],\n'
              else:
                line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[DIM * map'+str(mapinds[g_m+k])+'idx],\n'
            line = line[:-2]+'};'
            code(line)




#
# simple alternative when no indirection
#
    else:
      code('')
      comm('process set elements')
      FOR_INC('n','threadIdx.x+blockIdx.x*blockDim.x','set_size','blockDim.x*gridDim.x')

#
# kernel call
#
    code('')
    comm('user-supplied kernel call')
    line = name+'_gpu('
    prefix = ' '*len(name)
    a = 0 #only apply indentation if its not the 0th argument
    indent =''
    for m in range (0, nargs):
      if a > 0:
        indent = '     '+' '*len(name)

      if maps[m] == OP_GBL:
        if accs[m] == OP_READ or accs[m] == OP_WRITE:
          line += rep(indent+'ARG,\n',m)
        else:
          line += rep(indent+'ARG_l,\n',m);
        a =a+1
      elif maps[m]==OP_MAP and  accs[m]==OP_INC and not op_color2:
        if vectorised[m]:
          if m+1 in unique_args:
            line += rep(indent+'ARG_vec,\n',m)
        else:
          line += rep(indent+'ARG_l,\n',m)
        a =a+1
      elif maps[m]==OP_MAP:
        if vectorised[m]:
          if m+1 in unique_args:
            line += rep(indent+'ARG_vec,\n',m)
        else:
          if soaflags[m]:
            line += rep(indent+'ind_arg'+str(inds[m]-1)+'+map'+str(mapinds[m])+'idx,'+'\n',m)
          else:
            line += rep(indent+'ind_arg'+str(inds[m]-1)+'+map'+str(mapinds[m])+'idx*DIM,'+'\n',m)
        a =a+1
      elif maps[m]==OP_ID:
        if ninds>0 and not op_color2:
          if soaflags[m]:
            line += rep(indent+'ARG+(n+offset_b),\n',m)
          else:
            line += rep(indent+'ARG+(n+offset_b)*DIM,\n',m)
          a =a+1
        else:
          if soaflags[m]:
            line += rep(indent+'ARG+n,\n',m)
          else:
            line += rep(indent+'ARG+n*DIM,\n',m)
          a =a+1
      else:
        print 'internal error 1 '

    code(line[0:-2]+');') #remove final ',' and \n

#
# updating for indirect kernels ...
#
    if ninds>0 and not op_color2:
      if ind_inc:
        code('col2 = colors[n+offset_b];')
        ENDIF()
        code('')
        comm('store local variables')
        code('')
        if inc_stage==1:
          for g_m in range(0,nargs):
            if maps[g_m]==OP_MAP and accs[g_m]==OP_INC:
              code('int ARG_map;')
          IF('col2>=0')
          for g_m in range(0,nargs):
            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              code('ARG_map = arg_map['+str(cumulative_indirect_index[g_m])+'*set_size+n+offset_b];')
          ENDIF()
          code('')

        FOR('col','0','ncolor')
        IF('col2==col')

        if inc_stage==1:
          for g_m in range(0,nargs):
            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              if optflags[g_m]==1:
                IF('optflags & 1<<'+str(optidxs[g_m]))
              for d in range(0,int(dims[g_m])):
                if soaflags[g_m]:
                  code('ARG_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'_s[ARG_map+'+str(d)+'*ind_arg'+str(inds[g_m]-1)+'_size];')
                else:
                  code('ARG_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'_s['+str(d)+'+ARG_map*DIM];')
#          for g_m in range(0,nargs):
#            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              for d in range(0,int(dims[g_m])):
                if soaflags[g_m]:
                  code('ind_arg'+str(inds[g_m]-1)+'_s[ARG_map+'+str(d)+'*ind_arg'+str(inds[g_m]-1)+'_size] = ARG_l['+str(d)+'];')
                else:
                  code('ind_arg'+str(inds[g_m]-1)+'_s['+str(d)+'+ARG_map*DIM] = ARG_l['+str(d)+'];')
                
              if optflags[g_m]==1:
                ENDIF()
        else:
          for g_m in range(0,nargs):
            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              if optflags[g_m]==1:
                IF('optflags & 1<<'+str(optidxs[g_m]))
              for d in range(0,int(dims[g_m])):
                if soaflags[g_m]:
                  code('ARG_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx];')
                else:
                  code('ARG_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*DIM];')
#          for g_m in range(0,nargs):
#            if maps[g_m] == OP_MAP and accs[g_m] == OP_INC:
              for d in range(0,int(dims[g_m])):
                if soaflags[g_m]:
                  code('ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx] = ARG_l['+str(d)+'];')
                else:
                  code('ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*DIM] = ARG_l['+str(d)+'];')
              if optflags[g_m]==1:
                ENDIF()

        ENDFOR()
        code('__syncthreads();')
        ENDFOR()
    if ninds == 0 or not op_color2:
      ENDFOR()

    if inc_stage:
      for g_m in range(0,ninds):
        if indaccs[g_m]==OP_INC:
          if indopts[g_m] > 0:
            IF('optflags & 1<<'+str(optidxs[indopts[g_m-1]]))
          if soaflags[invinds[g_m]]:
            FOR_INC('n','threadIdx.x','INDARG_size','blockDim.x')
            for d in range(0,int(dims[invinds[g_m]])):
              code('arg'+str(invinds[g_m])+'_l['+str(d)+'] = INDARG_s[n+'+str(d)+'*INDARG_size] + INDARG[INDARG_map[n]+'+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'];')
            for d in range(0,int(dims[invinds[g_m]])):
              code('INDARG[INDARG_map[n]+'+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'] = arg'+str(invinds[g_m])+'_l['+str(d)+'];')
            ENDFOR()
          else:
            FOR_INC('n','threadIdx.x','INDARG_size*INDDIM','blockDim.x')
            code('INDARG[n%INDDIM+INDARG_map[n/INDDIM]*INDDIM] += INDARG_s[n];')
            ENDFOR()
          if indopts[g_m] > 0:
            ENDIF()

#
# global reduction
#
    if reduct:
       code('')
       comm('global reductions')
       code('')
       for m in range (0,nargs):
         g_m = m
         if maps[m]==OP_GBL and accs[m]<>OP_READ and accs[m] <> OP_WRITE:
           FOR('d','0','DIM')
           if accs[m]==OP_INC:
             code('op_reduction<OP_INC>(&ARG[d+blockIdx.x*DIM],ARG_l[d]);')
           elif accs[m]==OP_MIN:
             code('op_reduction<OP_MIN>(&ARG[d+blockIdx.x*DIM],ARG_l[d]);')
           elif accs[m]==OP_MAX:
             code('op_reduction<OP_MAX>(&ARG[d+blockIdx.x*DIM],ARG_l[d]);')
           else:
             print 'internal error: invalid reduction option'
             sys.exit(2);
           ENDFOR()
    depth -= 2
    code('}')
    code('')

##########################################################################
# then C++ stub function
##########################################################################

    code('')
    comm('host stub function')
    code('void op_par_loop_'+name+'(char const *name, op_set set,')
    depth += 2

    for m in unique_args:
      g_m = m - 1
      if m == unique_args[len(unique_args)-1]:
        code('op_arg ARG){')
        code('')
      else:
        code('op_arg ARG,')

    for g_m in range (0,nargs):
      if maps[g_m]==OP_GBL:
        code('TYP*ARGh = (TYP *)ARG.data;')

    code('int nargs = '+str(nargs)+';')
    code('op_arg args['+str(nargs)+'];')
    code('')


    for g_m in range (0,nargs):
      u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
      if len(u) > 0 and vectorised[g_m] > 0:
        code('ARG.idx = 0;')
        code('args['+str(g_m)+'] = ARG;')

        v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
        first = [i for i in range(0,len(v)) if v[i] == 1]
        first = first[0]
        if (optflags[g_m] == 1):
          argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, '
        else:
          argtyp = 'op_arg_dat('

        FOR('v','1',str(sum(v)))
        code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
        str(first)+'.map, DIM, "TYP", '+accsstring[accs[g_m]-1]+');')
        ENDFOR()
        code('')

      elif vectorised[g_m]>0:
        pass
      else:
        code('args['+str(g_m)+'] = ARG;')

    if nopts>0:
      code('int optflags = 0;')
      for i in range(0,nargs):
        if optflags[i] == 1:
          IF('args['+str(i)+'].opt')
          code('optflags |= 1<<'+str(optidxs[i])+';')
          ENDIF()
    if nopts > 30:
      print 'ERROR: too many optional arguments to store flags in an integer'
#
# start timing
#
    code('')
    comm(' initialise timers')
    code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
    code('op_timing_realloc('+str(nk)+');')
    code('op_timers_core(&cpu_t1, &wall_t1);')
    code('OP_kernels[' +str(nk)+ '].name      = name;')
    code('OP_kernels[' +str(nk)+ '].count    += 1;')
    code('')

#
#   indirect bits
#
    if ninds>0:
      code('')
      code('int    ninds   = '+str(ninds)+';')
      line = 'int    inds['+str(nargs)+'] = {'
      for m in range(0,nargs):
        line += str(inds[m]-1)+','
      code(line[:-1]+'};')
      code('')

      IF('OP_diags>2')
      code('printf(" kernel routine with indirection: '+name+'\\n");')
      ENDIF()

      code('')
      comm('get plan')
      code('#ifdef OP_PART_SIZE_'+ str(nk))
      code('  int part_size = OP_PART_SIZE_'+str(nk)+';')
      code('#else')
      code('  int part_size = OP_part_size;')
      code('#endif')
      code('')
      code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);')

#
# direct bit
#
    else:
      code('')
      IF('OP_diags>2')
      code('printf(" kernel routine w/o indirection:  '+ name + '");')
      ENDIF()
      code('')
      code('op_mpi_halo_exchanges_cuda(set, nargs, args);')

    IF('set->size > 0')
    code('')

#
# kernel call for indirect version
#
    if ninds>0:
      if inc_stage==1 and ind_inc:
        code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_INC);')
      elif op_color2:
        code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);')
      else:
        code('op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);')
      code('')


#
# transfer constants
#
    g = [i for i in range(0,nargs) if maps[i] == OP_GBL and (accs[i] == OP_READ or accs[i] == OP_WRITE)]
    if len(g)>0:
      comm('transfer constants to GPU')
      code('int consts_bytes = 0;')
      for m in range(0,nargs):
        g_m = m
        if maps[m]==OP_GBL and (accs[m]==OP_READ or accs[m] == OP_WRITE):
          code('consts_bytes += ROUND_UP(DIM*sizeof(TYP));')

      code('reallocConstArrays(consts_bytes);')
      code('consts_bytes = 0;')

      for m in range(0,nargs):
        if maps[m]==OP_GBL and (accs[m] == OP_READ or accs[m] == OP_WRITE):
          g_m = m
          code('ARG.data   = OP_consts_h + consts_bytes;')
          code('ARG.data_d = OP_consts_d + consts_bytes;')
          FOR('d','0','DIM')
          code('((TYP *)ARG.data)[d] = ARGh[d];')
          ENDFOR()
          code('consts_bytes += ROUND_UP(DIM*sizeof(TYP));')
      code('mvConstArraysToDevice(consts_bytes);')
      code('')

      #managing constants
    if any_soa:
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))')
            code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');')
            code('cudaMemcpyToSymbol(opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT, &opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST,sizeof(int));')
            ENDIF()
      if dir_soa<>-1:
          IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))')
          code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');')
          code('cudaMemcpyToSymbol(direct_'+name+'_stride_OP2CONSTANT,&direct_'+name+'_stride_OP2HOST,sizeof(int));')
          ENDIF()

#
# transfer global reduction initial data
#

    if ninds == 0:
      comm('set CUDA execution parameters')
      code('#ifdef OP_BLOCK_SIZE_'+str(nk))
      code('  int nthread = OP_BLOCK_SIZE_'+str(nk)+';')
      code('#else')
      code('  int nthread = OP_block_size;')
      comm('  int nthread = 128;')
      code('#endif')
      code('')
      code('int nblocks = 200;')
      code('')

    if reduct:
      comm('transfer global reduction data to GPU')
      if ninds>0:
        code('int maxblocks = 0;')
        FOR('col','0','Plan->ncolors')
        code('maxblocks = MAX(maxblocks,Plan->ncolblk[col]);')
        ENDFOR()
      else:
        code('int maxblocks = nblocks;')

      code('int reduct_bytes = 0;')
      code('int reduct_size  = 0;')

      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE:
          code('reduct_bytes += ROUND_UP(maxblocks*DIM*sizeof(TYP));')
          code('reduct_size   = MAX(reduct_size,sizeof(TYP));')

      code('reallocReductArrays(reduct_bytes);')
      code('reduct_bytes = 0;')

      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE:
          code('ARG.data   = OP_reduct_h + reduct_bytes;')
          code('ARG.data_d = OP_reduct_d + reduct_bytes;')
          FOR('b','0','maxblocks')
          FOR('d','0','DIM')
          if accs[g_m]==OP_INC:
            code('((TYP *)ARG.data)[d+b*DIM] = ZERO_TYP;')
          else:
            code('((TYP *)ARG.data)[d+b*DIM] = ARGh[d];')
          ENDFOR()
          ENDFOR()
          code('reduct_bytes += ROUND_UP(maxblocks*DIM*sizeof(TYP));')
      code('mvReductArraysToDevice(reduct_bytes);')
      code('')

#
# kernel call for indirect version
#
    if ninds>0:
      comm('execute plan')
      if not op_color2:
        code('')
        code('int block_offset = 0;')
      FOR('col','0','Plan->ncolors')
      IF('col==Plan->ncolors_core')
      code('op_mpi_wait_all_cuda(nargs, args);')
      ENDIF()
      code('#ifdef OP_BLOCK_SIZE_'+str(nk))
      code('int nthread = OP_BLOCK_SIZE_'+str(nk)+';')
      code('#else')
      code('int nthread = OP_block_size;')
      code('#endif')
      code('')
      if op_color2:
        code('int start = Plan->col_offsets[0][col];')
        code('int end = Plan->col_offsets[0][col+1];')
        code('int nblocks = (end - start - 1)/nthread + 1;')
      else:
        code('dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col],')
        code('Plan->ncolblk[col] >= (1<<16) ? (Plan->ncolblk[col]-1)/65535+1: 1, 1);')
        IF('Plan->ncolblk[col] > 0')

      if reduct or (inc_stage==1 and ind_inc):
        if reduct and inc_stage==1:
          code('int nshared = MAX(Plan->nshared,reduct_size*nthread);')
        elif reduct:
          code('int nshared = reduct_size*nthread;')
        else:
          code('int nshared = Plan->nsharedCol[col];')
        code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(')
      else:
        code('op_cuda_'+name+'<<<nblocks,nthread>>>(')

      if nopts > 0:
        code('optflags,')
      for m in range(1,ninds+1):
        g_m = invinds[m-1]
        code('(TYP *)ARG.data_d,')
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            code('arg'+str(g_m)+'.map_data_d, ')
      for g_m in range(0,nargs):
        if inds[g_m]==0:
          code('(TYP*)ARG.data_d,')

      if inc_stage==1 and ind_inc:
        code('Plan->ind_map,')
        code('Plan->loc_map,')
        code('Plan->ind_sizes,')
        code('Plan->ind_offs,')
      if op_color2:
        code('start,')
        code('end,')
        code('Plan->col_reord,')
      else:
        code('block_offset,')
        code('Plan->blkmap,')
        code('Plan->offset,')
        code('Plan->nelems,')
        code('Plan->nthrcol,')
        code('Plan->thrcol,')
        code('Plan->ncolblk[col],')
      code('set->size+set->exec_size);')
      code('')
      if reduct:
        comm('transfer global reduction data back to CPU')
        IF('col == Plan->ncolors_owned-1')
        code('mvReductArraysToHost(reduct_bytes);')
        ENDIF()
      if not op_color2:
        ENDFOR() #TODO sztem ez forditva van...
        code('block_offset += Plan->ncolblk[col];')
      ENDIF()
#
# kernel call for direct version
#
    else:
      if reduct:
        code('int nshared = reduct_size*nthread;')
        code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(')
      else:
        code('op_cuda_'+name+'<<<nblocks,nthread>>>(')

      indent = '  '#*(len(name)+42)
      if nopts > 0:
        code(indent+'optflags,')
      for g_m in range(0,nargs):
        if g_m > 0:
          code(indent+'(TYP *) ARG.data_d,')
        else:
          code(indent+'(TYP *) ARG.data_d,')

      code(indent+'set->size );')

    if ninds>0:
      code('OP_kernels['+str(nk)+'].transfer  += Plan->transfer;')
      code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;')


#
# transfer global reduction initial data
#
    if reduct:
      if ninds == 0:
        comm('transfer global reduction data back to CPU')
        code('mvReductArraysToHost(reduct_bytes);')

      for m in range(0,nargs):
        g_m = m
        if maps[m]==OP_GBL and accs[m]<>OP_READ and accs[m] <> OP_WRITE:
          FOR('b','0','maxblocks')
          FOR('d','0','DIM')
          if accs[m]==OP_INC:
            code('ARGh[d] = ARGh[d] + ((TYP *)ARG.data)[d+b*DIM];')
          elif accs[m]==OP_MIN:
            code('ARGh[d] = MIN(ARGh[d],((TYP *)ARG.data)[d+b*DIM]);')
          elif accs[m]==OP_MAX:
            code('ARGh[d] = MAX(ARGh[d],((TYP *)ARG.data)[d+b*DIM]);')
          ENDFOR()
          ENDFOR()

          code('ARG.data = (char *)ARGh;')
          code('op_mpi_reduce(&ARG,ARGh);')
          
    for g_m in range(0,nargs):
      if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE:
        code('')
        code('mvConstArraysToHost(consts_bytes);')
        break

    for g_m in range(0,nargs):
      if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE:
        FOR('d','0','DIM')
        code('ARGh[d] = ((TYP *)ARG.data)[d];') 
        ENDFOR()
        code('ARG.data = (char *)ARGh;')
        code('op_mpi_reduce(&ARG,ARGh);')

    ENDIF()
    code('op_mpi_set_dirtybit_cuda(nargs, args);')

#
# update kernel record
#

    code('cutilSafeCall(cudaDeviceSynchronize());')
    comm('update kernel record')
    code('op_timers_core(&cpu_t2, &wall_t2);')
    code('OP_kernels[' +str(nk)+ '].time     += wall_t2 - wall_t1;')

    if ninds == 0:
      line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *'

      for g_m in range (0,nargs):
        if optflags[g_m]==1:
          IF('ARG.opt')
        if maps[g_m]<>OP_GBL:
          if accs[g_m]==OP_READ:
            code(line+' ARG.size;')
          else:
            code(line+' ARG.size * 2.0f;')
        if optflags[g_m]==1:
          ENDIF()
    depth = depth - 2
    code('}')


##########################################################################
#  output individual kernel file
##########################################################################
    if not os.path.exists('cuda'):
        os.makedirs('cuda')
    fid = open('cuda/'+name+'_kernel.cu','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

# end of main kernel call loop


##########################################################################
#  output one master kernel file
##########################################################################

  file_text = ''

  comm('global constants')

  code('#ifndef MAX_CONST_SIZE')
  code('#define MAX_CONST_SIZE 128')
  code('#endif')
  code('')

  for nc in range (0,len(consts)):
    if consts[nc]['dim']==1:
      code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';')
    else:
      if consts[nc]['dim'] > 0:
        num = str(consts[nc]['dim'])
      else:
        num = 'MAX_CONST_SIZE'

      code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];')
  code('')

  comm('header')

  if os.path.exists('./user_types.h'):
    code('#ifndef OP_FUN_PREFIX\n#define OP_FUN_PREFIX __host__ __device__\n#endif')
    code('#include "../user_types.h"')
  code('#include "op_lib_cpp.h"')
  code('#include "op_cuda_rt_support.h"')
  code('#include "op_cuda_reduction.h"')

  code('')
  code('void op_decl_const_char(int dim, char const *type,')
  code('int size, char *dat, char const *name){')
  depth = depth + 2

  code('if (!OP_hybrid_gpu) return;')
  for nc in range(0,len(consts)):
    IF('!strcmp(name,"'+consts[nc]['name']+'")')
    if consts[nc]['dim'] < 0:
      IF('!strcmp(name,"'+consts[nc]['name']+'") && size>MAX_CONST_SIZE) {')
      code('printf("error: MAX_CONST_SIZE not big enough\n"); exit(1);')
      ENDIF()
    code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+', dat, dim*size));')
    ENDIF()
    code('else ')

  code('{')
  depth = depth + 2
  code('printf("error: unknown const name\\n"); exit(1);')
  ENDIF()


  depth = depth - 2
  code('}')
  code('')
  comm('user kernel files')

  for nk in range(0,len(kernels)):
    file_text = file_text +\
    '#include "'+kernels[nk]['name']+'_kernel.cu"\n'

  master = master.split('.')[0]
  fid = open('cuda/'+master.split('.')[0]+'_kernels.cu','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()
示例#14
0
def op2_gen_openmp_simple(master, date, consts, kernels):

  global dims, idxs, typs, indtyps, inddims
  global FORTRAN, CPP, g_m, file_text, depth

  OP_ID   = 1;  OP_GBL   = 2;  OP_MAP = 3;

  OP_READ = 1;  OP_WRITE = 2;  OP_RW  = 3;
  OP_INC  = 4;  OP_MAX   = 5;  OP_MIN = 6;

  accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ]

  any_soa = 0
  for nk in range (0,len(kernels)):
    any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

  for nk in range (0,len(kernels)):
    name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
            ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
            unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])

    optidxs = [0]*nargs
    indopts = [-1]*nargs
    nopts = 0
    for i in range(0,nargs):
      if optflags[i] == 1 and maps[i] == OP_ID:
        optidxs[i] = nopts
        nopts = nopts+1
      elif optflags[i] == 1 and maps[i] == OP_MAP:
        if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination
          optidxs[i] = nopts
          indopts[inds[i]-1] = i
          nopts = nopts+1
        else:
          optidxs[i] = optidxs[invinds[inds[i]-1]]

#
# set two logicals
#
    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_INC:
        j = i
    ind_inc = j >= 0

    j = -1
    for i in range(0,nargs):
      if maps[i] == OP_GBL and accs[i] <> OP_READ and accs[i] <> OP_WRITE:
        j = i
    reduct = j >= 0

##########################################################################
#  start with the user kernel function
##########################################################################

    FORTRAN = 0;
    CPP     = 1;
    g_m = 0;
    file_text = ''
    depth = 0

    comm('user function')

    if FORTRAN:
      code('include '+name+'.inc')
    elif CPP:
      code('#include "../'+decl_filepath+'"')

##########################################################################
# then C++ stub function
##########################################################################

    code('')
    comm(' host stub function')
    code('void op_par_loop_'+name+'(char const *name, op_set set,')
    depth += 2

    for m in unique_args:
      g_m = m - 1
      if m == unique_args[len(unique_args)-1]:
        code('op_arg ARG){');
        code('')
      else:
        code('op_arg ARG,')

    for g_m in range (0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m] <> OP_READ:
        code('TYP*ARGh = (TYP *)ARG.data;')

    code('int nargs = '+str(nargs)+';')
    code('op_arg args['+str(nargs)+'];')
    code('')

    for g_m in range (0,nargs):
      u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
      if len(u) > 0 and vectorised[g_m] > 0:
        code('ARG.idx = 0;')
        code('args['+str(g_m)+'] = ARG;')

        v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
        first = [i for i in range(0,len(v)) if v[i] == 1]
        first = first[0]
        if (optflags[g_m] == 1):
          argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, '
        else:
          argtyp = 'op_arg_dat('

        FOR('v','1',str(sum(v)))
        code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
        str(first)+'.map, DIM, "TYP", '+accsstring[accs[g_m]-1]+');')
        ENDFOR()
        code('')
      elif vectorised[g_m]>0:
        pass
      else:
        code('args['+str(g_m)+'] = ARG;')

#
# start timing
#
    code('')
    comm(' initialise timers')
    code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
    code('op_timing_realloc('+str(nk)+');')
    code('op_timers_core(&cpu_t1, &wall_t1);')
    code('')

#
#   indirect bits
#
    if ninds>0:
      code('int  ninds   = '+str(ninds)+';')
      line = 'int  inds['+str(nargs)+'] = {'
      for m in range(0,nargs):
        line += str(inds[m]-1)+','
      code(line[:-1]+'};')
      code('')

      IF('OP_diags>2')
      code('printf(" kernel routine with indirection: '+name+'\\n");')
      ENDIF()

      code('')
      comm(' get plan')
      code('#ifdef OP_PART_SIZE_'+ str(nk))
      code('  int part_size = OP_PART_SIZE_'+str(nk)+';')
      code('#else')
      code('  int part_size = OP_part_size;')
      code('#endif')
      code('')
      code('int set_size = op_mpi_halo_exchanges(set, nargs, args);')

#
# direct bit
#
    else:
      code('')
      IF('OP_diags>2')
      code('printf(" kernel routine w/o indirection:  '+ name + '");')
      ENDIF()
      code('')
      code('op_mpi_halo_exchanges(set, nargs, args);')

#
# set number of threads in x86 execution and create arrays for reduction
#

    if reduct or ninds==0:
      comm(' set number of threads')
      code('#ifdef _OPENMP')
      code('  int nthreads = omp_get_max_threads();')
      code('#else')
      code('  int nthreads = 1;')
      code('#endif')

    if reduct:
      code('')
      comm(' allocate and initialise arrays for global reduction')
      for g_m in range(0,nargs):
        if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m] <> OP_WRITE:
          code('TYP ARG_l[nthreads*64];')
          FOR('thr','0','nthreads')
          if accs[g_m]==OP_INC:
            FOR('d','0','DIM')
            code('ARG_l[d+thr*64]=ZERO_TYP;')
            ENDFOR()
          else:
            FOR('d','0','DIM')
            code('ARG_l[d+thr*64]=ARGh[d];')
            ENDFOR()
          ENDFOR()

    code('')
    IF('set->size >0')
    code('')

#
# kernel call for indirect version
#
    if ninds>0:
      code('op_plan *Plan = op_plan_get_stage_upload(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_ALL,0);')
      code('')
      comm(' execute plan')
      code('int block_offset = 0;')
      FOR('col','0','Plan->ncolors')
      IF('col==Plan->ncolors_core')
      code('op_mpi_wait_all(nargs, args);')
      ENDIF()
      code('int nblocks = Plan->ncolblk[col];')
      code('')
      code('#pragma omp parallel for')
      FOR('blockIdx','0','nblocks')
      code('int blockId  = Plan->blkmap[blockIdx + block_offset];')
      code('int nelem    = Plan->nelems[blockId];')
      code('int offset_b = Plan->offset[blockId];')
      FOR('n','offset_b','offset_b+nelem')
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+\
              '.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];')
      code('')
      for g_m in range (0,nargs):
        u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
        if len(u) > 0 and vectorised[g_m] > 0:
          if accs[g_m] == OP_READ:
            line = 'const TYP* ARG_vec[] = {\n'
          else:
            line = 'TYP* ARG_vec[] = {\n'

          v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
          first = [i for i in range(0,len(v)) if v[i] == 1]
          first = first[0]

          indent = ' '*(depth+2)
          for k in range(0,sum(v)):
            line = line + indent + ' &((TYP*)arg'+str(first)+'.data)[DIM * map'+str(mapinds[g_m+k])+'idx],\n'
          line = line[:-2]+'};'
          code(line)
      code('')
      line = name+'('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          line = line + indent + '&(('+typs[g_m]+'*)arg'+str(g_m)+'.data)['+str(dims[g_m])+' * n]'
        if maps[g_m] == OP_MAP:
          if vectorised[g_m]:
            if g_m+1 in unique_args:
                line = line + indent + 'arg'+str(g_m)+'_vec'
          else:
            line = line + indent + '&(('+typs[g_m]+'*)arg'+str(invinds[inds[g_m]-1])+'.data)['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]'
        if maps[g_m] == OP_GBL:
          if accs[g_m] <> OP_READ and accs[g_m] <> OP_WRITE:
            line = line + indent +'&arg'+str(g_m)+'_l[64*omp_get_thread_num()]'
          else:
            line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data'
        if g_m < nargs-1:
          if g_m+1 in unique_args and not g_m+1 == unique_args[-1]:
            line = line +','
        else:
           line = line +');'
      code(line)
      ENDFOR()
      ENDFOR()
      code('')

      if reduct:
        comm(' combine reduction data')
        IF('col == Plan->ncolors_owned-1')
        for m in range(0,nargs):
          if maps[m] == OP_GBL and accs[m] <> OP_READ:
            FOR('thr','0','nthreads')
            if accs[m]==OP_INC:
              FOR('d','0','DIM')
              code('ARGh[d] += ARG_l[d+thr*64];')
              ENDFOR()
            elif accs[m]==OP_MIN:
              FOR('d','0','DIM')
              code('ARGh[d]  = MIN(ARGh[d],ARG_l[d+thr*64]);')
              ENDFOR()
            elif  accs[m]==OP_MAX:
              FOR('d','0','DIM')
              code('ARGh[d]  = MAX(ARGh[d],ARG_l[d+thr*64]);')
              ENDFOR()
            else:
              error('internal error: invalid reduction option')
            ENDFOR()
        ENDIF()
      code('block_offset += nblocks;');
      ENDIF()

#
# kernel call for direct version
#
    else:
      comm(' execute plan')
      code('#pragma omp parallel for')
      FOR('thr','0','nthreads')
      code('int start  = (set->size* thr)/nthreads;')
      code('int finish = (set->size*(thr+1))/nthreads;')
      FOR('n','start','finish')
      line = name+'('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          line = line + indent + '&(('+typs[g_m]+'*)arg'+str(g_m)+'.data)['+str(dims[g_m])+'*n]'
        if maps[g_m] == OP_GBL:
          if accs[g_m] <> OP_READ and accs[g_m] <> OP_WRITE:
            line = line + indent +'&arg'+str(g_m)+'_l[64*omp_get_thread_num()]'
          else:
            line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data'
        if g_m < nargs-1:
          line = line +','
        else:
           line = line +');'
      code(line)
      ENDFOR()
      ENDFOR()

    if ninds>0:
      code('OP_kernels['+str(nk)+'].transfer  += Plan->transfer;')
      code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;')

    ENDIF()
    code('')

    #zero set size issues
    if ninds>0:
      IF('set_size == 0 || set_size == set->core_size')
      code('op_mpi_wait_all(nargs, args);')
      ENDIF()

#
# combine reduction data from multiple OpenMP threads, direct version
#
    comm(' combine reduction data')
    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m] <> OP_WRITE and ninds==0:
        FOR('thr','0','nthreads')
        if accs[g_m]==OP_INC:
          FOR('d','0','DIM')
          code('ARGh[d] += ARG_l[d+thr*64];')
          ENDFOR()
        elif accs[g_m]==OP_MIN:
          FOR('d','0','DIM')
          code('ARGh[d]  = MIN(ARGh[d],ARG_l[d+thr*64]);')
          ENDFOR()
        elif accs[g_m]==OP_MAX:
          FOR('d','0','DIM')
          code('ARGh[d]  = MAX(ARGh[d],ARG_l[d+thr*64]);')
          ENDFOR()
        else:
          print 'internal error: invalid reduction option'
        ENDFOR()
      if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ:
        code('op_mpi_reduce(&ARG,ARGh);')

    code('op_mpi_set_dirtybit(nargs, args);')
    code('')

#
# update kernel record
#

    comm(' update kernel record')
    code('op_timers_core(&cpu_t2, &wall_t2);')
    code('OP_kernels[' +str(nk)+ '].name      = name;')
    code('OP_kernels[' +str(nk)+ '].count    += 1;')
    code('OP_kernels[' +str(nk)+ '].time     += wall_t2 - wall_t1;')

    if ninds == 0:
      line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *'

      for g_m in range (0,nargs):
        if optflags[g_m]==1:
          IF('ARG.opt')
        if maps[g_m]<>OP_GBL:
          if accs[g_m]==OP_READ:
            code(line+' ARG.size;')
          else:
            code(line+' ARG.size * 2.0f;')
        if optflags[g_m]==1:
          ENDIF()

    depth -= 2
    code('}')


##########################################################################
#  output individual kernel file
##########################################################################
    if not os.path.exists('openmp'):
        os.makedirs('openmp')
    fid = open('openmp/'+name+'_kernel.cpp','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

# end of main kernel call loop


##########################################################################
#  output one master kernel file
##########################################################################

  file_text =''

  comm(' global constants       ')

  for nc in range (0,len(consts)):
    if consts[nc]['dim']==1:
      code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';')
    else:
      if consts[nc]['dim'] > 0:
        num = str(consts[nc]['dim'])
      else:
        num = 'MAX_CONST_SIZE'

      code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];')
  code('')

  comm(' header                 ')

  if os.path.exists('./user_types.h'):
    code('#include "../user_types.h"')
  code('#include "op_lib_cpp.h"       ')
  code('')

  comm(' user kernel files')

  for nk in range(0,len(kernels)):
    code('#include "'+kernels[nk]['name']+'_kernel.cpp"')
  master = master.split('.')[0]
  fid = open('openmp/'+master.split('.')[0]+'_kernels.cpp','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()
示例#15
0
def op2_gen_seq(master, date, consts, kernels):

  global dims, idxs, typs, indtyps, inddims
  global FORTRAN, CPP, g_m, file_text, depth

  OP_ID   = 1;  OP_GBL   = 2;  OP_MAP = 3;

  OP_READ = 1;  OP_WRITE = 2;  OP_RW  = 3;
  OP_INC  = 4;  OP_MAX   = 5;  OP_MIN = 6;

  accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ]

  any_soa = 0
  for nk in range (0,len(kernels)):
    any_soa = any_soa or sum(kernels[nk]['soaflags'])

##########################################################################
#  create new kernel file
##########################################################################

  for nk in range (0,len(kernels)):

    name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \
            ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \
            unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk])

    optidxs = [0]*nargs
    indopts = [-1]*nargs
    nopts = 0
    for i in range(0,nargs):
      if optflags[i] == 1 and maps[i] == OP_ID:
        optidxs[i] = nopts
        nopts = nopts+1
      elif optflags[i] == 1 and maps[i] == OP_MAP:
        if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination
          optidxs[i] = nopts
          indopts[inds[i]-1] = i
          nopts = nopts+1
        else:
          optidxs[i] = optidxs[invinds[inds[i]-1]]
#
# set two logicals
#
    j = 0
    for i in range(0,nargs):
      if maps[i] == OP_MAP and accs[i] == OP_INC:
        j = i
    ind_inc = j > 0

    j = 0
    for i in range(0,nargs):
      if maps[i] == OP_GBL and accs[i] <> OP_READ:
        j = i
    reduct = j > 0

##########################################################################
#  start with the user kernel function
##########################################################################

    FORTRAN = 0;
    CPP     = 1;
    g_m = 0;
    file_text = ''
    depth = 0

    comm('user function')

    if FORTRAN:
      code('include '+name+'.inc')
    elif CPP:
      code('#include "../'+decl_filepath+'"')

##########################################################################
# then C++ stub function
##########################################################################

    code('')
    comm(' host stub function')
    code('void op_par_loop_'+name+'(char const *name, op_set set,')
    depth += 2

    for m in unique_args:
      g_m = m - 1
      if m == unique_args[len(unique_args)-1]:
        code('op_arg ARG){');
        code('')
      else:
        code('op_arg ARG,')

    code('int nargs = '+str(nargs)+';')
    code('op_arg args['+str(nargs)+'];')
    code('')

    for g_m in range (0,nargs):
      u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
      if len(u) > 0 and vectorised[g_m] > 0:
        code('ARG.idx = 0;')
        code('args['+str(g_m)+'] = ARG;')

        v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
        first = [i for i in range(0,len(v)) if v[i] == 1]
        first = first[0]
        if (optflags[g_m] == 1):
          argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, '
        else:
          argtyp = 'op_arg_dat('

        FOR('v','1',str(sum(v)))
        code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\
        str(first)+'.map, DIM, "TYP", '+accsstring[accs[g_m]-1]+');')
        ENDFOR()
        code('')
      elif vectorised[g_m]>0:
        pass
      else:
        code('args['+str(g_m)+'] = ARG;')

#
# start timing
#
    code('')
    comm(' initialise timers')
    code('double cpu_t1, cpu_t2, wall_t1, wall_t2;')
    code('op_timing_realloc('+str(nk)+');')
    code('op_timers_core(&cpu_t1, &wall_t1);')
    code('')

#
#   indirect bits
#
    if ninds>0:
      IF('OP_diags>2')
      code('printf(" kernel routine with indirection: '+name+'\\n");')
      ENDIF()

#
# direct bit
#
    else:
      code('')
      IF('OP_diags>2')
      code('printf(" kernel routine w/o indirection:  '+ name + '");')
      ENDIF()

    code('')
    code('int set_size = op_mpi_halo_exchanges(set, nargs, args);')

    code('')
    IF('set->size >0')
    code('')

#
# kernel call for indirect version
#
    if ninds>0:
      FOR('n','0','set_size')
      IF('n==set->core_size')
      code('op_mpi_wait_all(nargs, args);')
      ENDIF()
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapinds[g_m] in k):
            k = k + [mapinds[g_m]]
            code('int map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];')
      code('')
      for g_m in range (0,nargs):
        u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m]
        if len(u) > 0 and vectorised[g_m] > 0:
          if accs[g_m] == OP_READ:
            line = 'const TYP* ARG_vec[] = {\n'
          else:
            line = 'TYP* ARG_vec[] = {\n'

          v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))]
          first = [i for i in range(0,len(v)) if v[i] == 1]
          first = first[0]
        
          indent = ' '*(depth+2)
          for k in range(0,sum(v)):
            line = line + indent + ' &((TYP*)arg'+str(first)+'.data)[DIM * map'+str(mapinds[g_m+k])+'idx],\n'
          line = line[:-2]+'};'
          code(line)
      code('')

      line = name+'('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          line = line + indent + '&(('+typs[g_m]+'*)arg'+str(g_m)+'.data)['+str(dims[g_m])+' * n]'
        if maps[g_m] == OP_MAP: 
          if vectorised[g_m]:
            if g_m+1 in unique_args:
                line = line + indent + 'arg'+str(g_m)+'_vec'
          else:
            line = line + indent + '&(('+typs[g_m]+'*)arg'+str(invinds[inds[g_m]-1])+'.data)['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]'
        if maps[g_m] == OP_GBL:
          line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data'
        if g_m < nargs-1: 
          if g_m+1 in unique_args and not g_m+1 == unique_args[-1]:
            line = line +','
        else:
           line = line +');'
      code(line)
      ENDFOR()

#
# kernel call for direct version
#
    else:
      FOR('n','0','set_size')
      line = name+'('
      indent = '\n'+' '*(depth+2)
      for g_m in range(0,nargs):
        if maps[g_m] == OP_ID:
          line = line + indent + '&(('+typs[g_m]+'*)arg'+str(g_m)+'.data)['+str(dims[g_m])+'*n]'
        if maps[g_m] == OP_GBL:
          line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data'
        if g_m < nargs-1:
          line = line +','
        else:
           line = line +');'
      code(line)
      ENDFOR()

    ENDIF()
    code('')

    #zero set size issues
    if ninds>0:
      IF('set_size == 0 || set_size == set->core_size')
      code('op_mpi_wait_all(nargs, args);')
      ENDIF()

#
# combine reduction data from multiple OpenMP threads
#
    comm(' combine reduction data')
    for g_m in range(0,nargs):
      if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ:
#        code('op_mpi_reduce(&ARG,('+typs[g_m]+'*)ARG.data);')
        if typs[g_m] == 'double': #need for both direct and indirect
          code('op_mpi_reduce_double(&ARG,('+typs[g_m]+'*)ARG.data);')
        elif typs[g_m] == 'float':
          code('op_mpi_reduce_float(&ARG,('+typs[g_m]+'*)ARG.data);')
        elif typs[g_m] == 'int':
          code('op_mpi_reduce_int(&ARG,('+typs[g_m]+'*)ARG.data);')
        else:
          print 'Type '+typs[g_m]+' not supported in OpenACC code generator, please add it'
          exit(-1)

    code('op_mpi_set_dirtybit(nargs, args);')
    code('')

#
# update kernel record
#

    comm(' update kernel record')
    code('op_timers_core(&cpu_t2, &wall_t2);')
    code('OP_kernels[' +str(nk)+ '].name      = name;')
    code('OP_kernels[' +str(nk)+ '].count    += 1;')
    code('OP_kernels[' +str(nk)+ '].time     += wall_t2 - wall_t1;')

    if ninds == 0:
      line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *'

      for g_m in range (0,nargs):
        if optflags[g_m]==1:
          IF('ARG.opt')
        if maps[g_m]<>OP_GBL:
          if accs[g_m]==OP_READ:
            code(line+' ARG.size;')
          else:
            code(line+' ARG.size * 2.0f;')
        if optflags[g_m]==1:
          ENDIF()
    else:
      names = []
      for g_m in range(0,ninds):
        mult=''
        if indaccs[g_m] <> OP_WRITE and indaccs[g_m] <> OP_READ:
          mult = ' * 2.0f'
        if not var[invinds[g_m]] in names:
          if optflags[g_m]==1:
            IF('arg'+str(invinds[g_m])+'.opt')
          code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(invinds[g_m])+'.size'+mult+';')
          if optflags[g_m]==1:
            ENDIF()
          names = names + [var[invinds[g_m]]]
      for g_m in range(0,nargs):
        mult=''
        if accs[g_m] <> OP_WRITE and accs[g_m] <> OP_READ:
          mult = ' * 2.0f'
        if not var[g_m] in names:
          names = names + [var[g_m]]
          if optflags[g_m]==1:
            IF('ARG.opt')
          if maps[g_m] == OP_ID:
            code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(g_m)+'.size'+mult+';')
          elif maps[g_m] == OP_GBL:
            code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(g_m)+'.size'+mult+';')
          if optflags[g_m]==1:
            ENDIF()
      if nmaps > 0:
        k = []
        for g_m in range(0,nargs):
          if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
            k = k + [mapnames[g_m]]
            code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(invinds[inds[g_m]-1])+'.map->dim * 4.0f;')

    depth -= 2
    code('}')


##########################################################################
#  output individual kernel file
##########################################################################
    if not os.path.exists('seq'):
        os.makedirs('seq')
    fid = open('seq/'+name+'_seqkernel.cpp','w')
    date = datetime.datetime.now()
    fid.write('//\n// auto-generated by op2.py\n//\n\n')
    fid.write(file_text)
    fid.close()

# end of main kernel call loop


##########################################################################
#  output one master kernel file
##########################################################################

  file_text =''

  comm(' global constants       ')

  for nc in range (0,len(consts)):
    if consts[nc]['dim']==1:
      code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';')
    else:
      if consts[nc]['dim'] > 0:
        num = str(consts[nc]['dim'])
      else:
        num = 'MAX_CONST_SIZE'

      code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];')
  code('')

  comm(' header                 ')

  if os.path.exists('./user_types.h'):
    code('#include "../user_types.h"')
  code('#include "op_lib_cpp.h"       ')
  code('')

  comm(' user kernel files')

  for nk in range(0,len(kernels)):
    code('#include "'+kernels[nk]['name']+'_seqkernel.cpp"')
  master = master.split('.')[0]
  fid = open('seq/'+master.split('.')[0]+'_seqkernels.cpp','w')
  fid.write('//\n// auto-generated by op2.py\n//\n\n')
  fid.write(file_text)
  fid.close()