def op2_gen_mpi_vec(master, date, consts, kernels): global dims, idxs, typs, indtyps, inddims global FORTRAN, CPP, g_m, file_text, depth OP_ID = 1 OP_GBL = 2 OP_MAP = 3 OP_READ = 1 OP_WRITE = 2 OP_RW = 3 OP_INC = 4 OP_MAX = 5 OP_MIN = 6 accsstring = ['OP_READ', 'OP_WRITE', 'OP_RW', 'OP_INC', 'OP_MAX', 'OP_MIN'] any_soa = 0 for nk in range(0, len(kernels)): any_soa = any_soa or sum(kernels[nk]['soaflags']) ########################################################################## # create new kernel file ########################################################################## for nk in range(0, len(kernels)): name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk]) # # set three logicals # j = -1 for i in range(0, nargs): if maps[i] == OP_MAP and accs[i] == OP_INC: j = i ind_inc = j >= 0 j = -1 for i in range(0, nargs): if maps[i] == OP_GBL and accs[i] <> OP_READ: j = i reduct = j >= 0 j = -1 for i in range(0, nargs): if maps[i] == OP_MAP: j = i indirect_kernel = j >= 0 #################################################################################### # generate the user kernel function - creating versions for vectorisation as needed #################################################################################### FORTRAN = 0 CPP = 1 g_m = 0 file_text = '' depth = 0 # # First original version # comm('user function') file_name = decl_filepath f = open(file_name, 'r') kernel_text = f.read() file_text += kernel_text f.close() # # Modified vectorisable version if its an indirect kernel # - direct kernels can be vectorised without modification # if indirect_kernel: code('#ifdef VECTORIZE') comm('user function -- modified for vectorisation') f = open(file_name, 'r') kernel_text = f.read() f.close() kernel_text = op2_gen_common.comment_remover(kernel_text) kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) p = re.compile('void\\s+\\b' + name + '\\b') i = p.search(kernel_text).start() if (i < 0): print "\n********" print "Error: cannot locate user kernel function name: " + name + " - Aborting code generation" exit(2) i2 = i #i = kernel_text[0:i].rfind('\n') #reverse find j = kernel_text[i:].find('{') k = op2_gen_common.para_parse(kernel_text, i + j, '{', '}') signature_text = kernel_text[i:i + j] l = signature_text[0:].find('(') head_text = signature_text[0:l] #save function name m = op2_gen_common.para_parse(signature_text, 0, '(', ')') signature_text = signature_text[l + 1:m] body_text = kernel_text[i + j + 1:k] ## Replace occurrences of '#include "<FILE>"' within loop with the contents of <FILE>: body_text = op2_gen_common.replace_local_includes_with_file_contents( body_text, os.path.dirname(master)) # check for number of arguments nargs_actual = len(signature_text.split(',')) if nargs_actual != nargs: print( 'Error parsing user kernel({0}): must have {1} arguments (instead it has {2})' .format(name, nargs, nargs_actual)) return new_signature_text = '' for i in range(0, nargs): var = signature_text.split(',')[i].strip() if maps[i] <> OP_GBL and maps[i] <> OP_ID: #remove * and add [*][SIMD_VEC] var = var.replace('*', '') #locate var in body and replace by adding [idx] length = len(re.compile('\\s+\\b').split(var)) var2 = re.compile('\\s+\\b').split(var)[length - 1].strip() #print var2 body_text = re.sub('\*\\b' + var2 + '\\b\\s*(?!\[)', var2 + '[0]', body_text) body_text = re.sub( r'(' + var2 + '\[[\w\(\)\+\-\*\s\\\\]*\]' + ')', r'\1' + '[idx]', body_text) var = var + '[*][SIMD_VEC]' #var = var + '[restrict][SIMD_VEC]' new_signature_text += var + ', ' #add ( , idx and ) signature_text = "inline " + head_text + '( ' + new_signature_text + 'int idx ) {' #finally update name signature_text = signature_text.replace(name, name + '_vec') #print head_text #print signature_text #print body_text file_text += signature_text + body_text + '}\n' code('#endif') ########################################################################## # then C++ stub function ########################################################################## code('') comm(' host stub function') code('void op_par_loop_' + name + '(char const *name, op_set set,') depth += 2 for m in unique_args: g_m = m - 1 if m == unique_args[len(unique_args) - 1]: code('op_arg <ARG>){') code('') else: code('op_arg <ARG>,') code('int nargs = ' + str(nargs) + ';') code('op_arg args[' + str(nargs) + '];') code('') for g_m in range(0, nargs): u = [ i for i in range(0, len(unique_args)) if unique_args[i] - 1 == g_m ] if len(u) > 0 and vectorised[g_m] > 0: code('<ARG>.idx = 0;') code('args[' + str(g_m) + '] = <ARG>;') v = [ int(vectorised[i] == vectorised[g_m]) for i in range(0, len(vectorised)) ] first = [i for i in range(0, len(v)) if v[i] == 1] first = first[0] if (optflags[g_m] == 1): argtyp = 'op_opt_arg_dat(arg' + str(first) + '.opt, ' else: argtyp = 'op_arg_dat(' FOR('v', '1', str(sum(v))) code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');') ENDFOR() code('') elif vectorised[g_m] > 0: pass else: code('args[' + str(g_m) + '] = <ARG>;') # # create aligned pointers # comm('create aligned pointers for dats') for g_m in range(0, nargs): if maps[g_m] <> OP_GBL: if (accs[g_m] == OP_INC or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE): code('ALIGNED_<TYP> <TYP> * __restrict__ ptr'+\ str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;') #code('<TYP>* __restrict__ __attribute__((align_value (<TYP>_ALIGN))) ptr'+\ #str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;') code('__assume_aligned(ptr' + str(g_m) + ',<TYP>_ALIGN);') else: code('ALIGNED_<TYP> const <TYP> * __restrict__ ptr'+\ str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;') code('__assume_aligned(ptr' + str(g_m) + ',<TYP>_ALIGN);') #code('const <TYP>* __restrict__ __attribute__((align_value (<TYP>_ALIGN))) ptr'+\ #str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;') # # start timing # code('') comm(' initialise timers') code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') code('op_timing_realloc(' + str(nk) + ');') code('op_timers_core(&cpu_t1, &wall_t1);') code('') # # indirect bits # if ninds > 0: IF('OP_diags>2') code('printf(" kernel routine with indirection: ' + name + '\\n");') ENDIF() # # direct bit # else: code('') IF('OP_diags>2') code('printf(" kernel routine w/o indirection: ' + name + '");') ENDIF() code('') code('int exec_size = op_mpi_halo_exchanges(set, nargs, args);') code('') IF('exec_size >0') code('') # # kernel call for indirect version # if ninds > 0: code('#ifdef VECTORIZE') code('#pragma novector') FOR2('n', '0', '(exec_size/SIMD_VEC)*SIMD_VEC', 'SIMD_VEC') #initialize globals for g_m in range(0, nargs): if maps[g_m] == OP_GBL: code('<TYP> dat{0}[SIMD_VEC];'.format(g_m)) FOR('i', '0', 'SIMD_VEC') if accs[g_m] == OP_INC: code('dat{0}[i] = 0.0;'.format(g_m)) elif accs[g_m] == OP_MAX: code('dat{0}[i] = -INFINITY;'.format(g_m)) elif accs[g_m] == OP_MIN: code('dat{0}[i] = INFINITY;'.format(g_m)) elif accs[g_m] == OP_READ: code('dat{0}[i] = *((<TYP>*)arg{0}.data);'.format(g_m)) ENDFOR() IF('n+SIMD_VEC >= set->core_size') code('op_mpi_wait_all(nargs, args);') ENDIF() for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (accs[g_m] == OP_READ \ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE \ or accs[g_m] == OP_INC): code('ALIGNED_<TYP> <TYP> dat' + str(g_m) + '[<DIM>][SIMD_VEC];') #setup gathers code('#pragma omp simd simdlen(SIMD_VEC)') FOR('i', '0', 'SIMD_VEC') if nmaps > 0: for g_m in range(0, nargs): if maps[g_m] == OP_MAP: if (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE): #and (not mapinds[g_m] in k): code('int idx' + str(g_m) + '_<DIM> = <DIM> * arg' + str(invmapinds[inds[g_m] - 1]) + '.map_data[(n+i) * arg' + str(invmapinds[inds[g_m] - 1]) + '.map->dim + ' + str(idxs[g_m]) + '];') code('') for g_m in range(0, nargs): if maps[g_m] == OP_MAP: if (accs[g_m] == OP_READ or accs[g_m] == OP_RW): #and (not mapinds[g_m] in k): for d in range(0, int(dims[g_m])): code('dat' + str(g_m) + '[' + str(d) + '][i] = (ptr' + str(g_m) + ')[idx' + str(g_m) + '_<DIM> + ' + str(d) + '];') code('') elif (accs[g_m] == OP_INC): for d in range(0, int(dims[g_m])): code('dat' + str(g_m) + '[' + str(d) + '][i] = 0.0;') code('') else: #globals if (accs[g_m] == OP_INC): # for d in range(0,int(dims[g_m])): # code('dat'+str(g_m)+'[i] = 0.0;') # code('') pass ENDFOR() #kernel call code('#pragma omp simd simdlen(SIMD_VEC)') FOR('i', '0', 'SIMD_VEC') line = name + '_vec(' indent = '\n' + ' ' * (depth + 2) for g_m in range(0, nargs): if maps[g_m] == OP_ID: line = line + indent + '&(ptr' + str(g_m) + ')[' + str( dims[g_m]) + ' * (n+i)],' elif maps[g_m] == OP_GBL and accs[g_m] == OP_READ: line = line + indent + '(' + typs[g_m] + '*)arg' + str( g_m) + '.data,' elif maps[g_m] == OP_GBL and accs[g_m] == OP_INC: line = line + indent + '&dat' + str(g_m) + '[i],' else: line = line + indent + 'dat' + str(g_m) + ',' line = line + indent + 'i);' code(line) ENDFOR() #do the scatters FOR('i', '0', 'SIMD_VEC') if nmaps > 0: for g_m in range(0, nargs): if maps[g_m] == OP_MAP: if (accs[g_m] == OP_INC or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE): #and (not mapinds[g_m] in k): code('int idx' + str(g_m) + '_<DIM> = <DIM> * arg' + str(invmapinds[inds[g_m] - 1]) + '.map_data[(n+i) * arg' + str(invmapinds[inds[g_m] - 1]) + '.map->dim + ' + str(idxs[g_m]) + '];') code('') for g_m in range(0, nargs): if maps[g_m] == OP_MAP: if (accs[g_m] == OP_INC): for d in range(0, int(dims[g_m])): code('(ptr' + str(g_m) + ')[idx' + str(g_m) + '_<DIM> + ' + str(d) + '] += dat' + str(g_m) + '[' + str(d) + '][i];') code('') if (accs[g_m] == OP_WRITE or accs[g_m] == OP_RW): for d in range(0, int(dims[g_m])): code('(ptr' + str(g_m) + ')[idx' + str(g_m) + '_<DIM> + ' + str(d) + '] = dat' + str(g_m) + '[' + str(d) + '][i];') code('') ENDFOR() #do reductions for g_m in range(0, nargs): if maps[g_m] == OP_GBL: FOR('i', '0', 'SIMD_VEC') if accs[g_m] == OP_INC: code('*(<TYP>*)arg' + str(g_m) + '.data += dat' + str(g_m) + '[i];') elif accs[g_m] == OP_MAX: code('*(<TYP>*)arg' + str(g_m) + '.data = MAX(*(<TYP>*)arg' + str(g_m) + '.data,dat' + str(g_m) + '[i]);') elif accs[g_m] == OP_MIN: code('*(<TYP>*)arg' + str(g_m) + '.data = MIN(*(<TYP>*)arg' + str(g_m) + '.data,dat' + str(g_m) + '[i]);') ENDFOR() ENDFOR() code('') comm('remainder') FOR('n', '(exec_size/SIMD_VEC)*SIMD_VEC', 'exec_size') depth = depth - 2 code('#else') FOR('n', '0', 'exec_size') depth = depth - 2 code('#endif') depth = depth + 2 IF('n==set->core_size') code('op_mpi_wait_all(nargs, args);') ENDIF() if nmaps > 0: k = [] #print name #print maps #print mapinds for g_m in range(0, nargs): #print g_m if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map' + str(mapinds[g_m]) + 'idx = arg' + str(invmapinds[inds[g_m] - 1]) + '.map_data[n * arg' + str(invmapinds[inds[g_m] - 1]) + '.map->dim + ' + str(idxs[g_m]) + '];') code('') line = name + '(' indent = '\n' + ' ' * (depth + 2) for g_m in range(0, nargs): if maps[g_m] == OP_ID: line = line + indent + '&(ptr' + str(g_m) + ')[' + str( dims[g_m]) + ' * n]' if maps[g_m] == OP_MAP: line = line + indent + '&(ptr' + str(g_m) + ')[' + str( dims[g_m]) + ' * map' + str(mapinds[g_m]) + 'idx]' if maps[g_m] == OP_GBL: line = line + indent + '(' + typs[g_m] + '*)arg' + str( g_m) + '.data' if g_m < nargs - 1: line = line + ',' else: line = line + ');' code(line) ENDFOR() # # kernel call for direct version # else: code('#ifdef VECTORIZE') code('#pragma novector') FOR2('n', '0', '(exec_size/SIMD_VEC)*SIMD_VEC', 'SIMD_VEC') #initialize globals for g_m in range(0, nargs): if maps[g_m] == OP_GBL: code('<TYP> dat{0}[SIMD_VEC];'.format(g_m)) FOR('i', '0', 'SIMD_VEC') if accs[g_m] == OP_INC: code('dat{0}[i] = 0.0;'.format(g_m)) elif accs[g_m] == OP_MAX: code('dat{0}[i] = -INFINITY;'.format(g_m)) elif accs[g_m] == OP_MIN: code('dat{0}[i] = INFINITY;'.format(g_m)) elif accs[g_m] == OP_READ: code('dat{0}[i] = *((<TYP>*)arg{0}.data);'.format(g_m)) ENDFOR() code('#pragma omp simd simdlen(SIMD_VEC)') FOR('i', '0', 'SIMD_VEC') line = name + '(' indent = '\n' + ' ' * (depth + 2) for g_m in range(0, nargs): if maps[g_m] == OP_ID: line = line + indent + '&(ptr' + str(g_m) + ')[' + str( dims[g_m]) + ' * (n+i)]' if maps[g_m] == OP_MAP: line = line + indent + '&(ptr' + str(g_m) + ')[' + str( dims[g_m]) + ' * map' + str(mapinds[g_m]) + 'idx]' if maps[g_m] == OP_GBL: line = line + indent + '&dat' + str(g_m) + '[i]' if g_m < nargs - 1: line = line + ',' else: line = line + ');' code(line) ENDFOR() #do reductions for g_m in range(0, nargs): if maps[g_m] == OP_GBL: FOR('i', '0', 'SIMD_VEC') if accs[g_m] == OP_INC: code('*(<TYP>*)arg' + str(g_m) + '.data += dat' + str(g_m) + '[i];') elif accs[g_m] == OP_MAX: code('*(<TYP>*)arg' + str(g_m) + '.data = MAX(*(<TYP>*)arg' + str(g_m) + '.data,dat' + str(g_m) + '[i]);') elif accs[g_m] == OP_MIN: code('*(<TYP>*)arg' + str(g_m) + '.data = MIN(*(<TYP>*)arg' + str(g_m) + '.data,dat' + str(g_m) + '[i]);') ENDFOR() ENDFOR() comm('remainder') FOR('n', '(exec_size/SIMD_VEC)*SIMD_VEC', 'exec_size') depth = depth - 2 code('#else') FOR('n', '0', 'exec_size') depth = depth - 2 code('#endif') depth = depth + 2 line = name + '(' indent = '\n' + ' ' * (depth + 2) for g_m in range(0, nargs): if maps[g_m] == OP_ID: line = line + indent + '&(ptr' + str(g_m) + ')[' + str( dims[g_m]) + '*n]' if maps[g_m] == OP_GBL: line = line + indent + '(' + typs[g_m] + '*)arg' + str( g_m) + '.data' if g_m < nargs - 1: line = line + ',' else: line = line + ');' code(line) ENDFOR() ENDIF() code('') #zero set size issues if ninds > 0: IF('exec_size == 0 || exec_size == set->core_size') code('op_mpi_wait_all(nargs, args);') ENDIF() # # combine reduction data from multiple OpenMP threads # comm(' combine reduction data') for g_m in range(0, nargs): if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ: code('op_mpi_reduce(&<ARG>,(' + typs[g_m] + '*)<ARG>.data);') code('op_mpi_set_dirtybit(nargs, args);') code('') # # update kernel record # comm(' update kernel record') code('op_timers_core(&cpu_t2, &wall_t2);') code('OP_kernels[' + str(nk) + '].name = name;') code('OP_kernels[' + str(nk) + '].count += 1;') code('OP_kernels[' + str(nk) + '].time += wall_t2 - wall_t1;') if ninds == 0: line = 'OP_kernels[' + str(nk) + '].transfer += (float)set->size *' for g_m in range(0, nargs): if maps[g_m] <> OP_GBL: if accs[g_m] == OP_READ: code(line + ' <ARG>.size;') else: code(line + ' <ARG>.size * 2.0f;') else: names = [] for g_m in range(0, ninds): mult = '' if indaccs[g_m] <> OP_WRITE and indaccs[g_m] <> OP_READ: mult = ' * 2.0f' if not var[invinds[g_m]] in names: code('OP_kernels[' + str(nk) + '].transfer += (float)set->size * arg' + str(invinds[g_m]) + '.size' + mult + ';') names = names + [var[invinds[g_m]]] for g_m in range(0, nargs): mult = '' if accs[g_m] <> OP_WRITE and accs[g_m] <> OP_READ: mult = ' * 2.0f' if not var[g_m] in names: names = names + [var[invinds[g_m]]] if maps[g_m] == OP_ID: code('OP_kernels[' + str(nk) + '].transfer += (float)set->size * arg' + str(g_m) + '.size' + mult + ';') elif maps[g_m] == OP_GBL: code('OP_kernels[' + str(nk) + '].transfer += (float)set->size * arg' + str(g_m) + '.size' + mult + ';') if nmaps > 0: k = [] for g_m in range(0, nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('OP_kernels[' + str(nk) + '].transfer += (float)set->size * arg' + str(invinds[inds[g_m] - 1]) + '.map->dim * 4.0f;') depth -= 2 code('}') ########################################################################## # output individual kernel file ########################################################################## if not os.path.exists('vec'): os.makedirs('vec') fid = open('vec/' + name + '_veckernel.cpp', 'w') date = datetime.datetime.now() #fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() # end of main kernel call loop ########################################################################## # output one master kernel file ########################################################################## file_text = '' code('#define double_ALIGN 128') code('#define float_ALIGN 64') code('#define int_ALIGN 64') code('#ifdef VECTORIZE') code('#define SIMD_VEC 4') code('#define ALIGNED_double __attribute__((aligned(double_ALIGN)))') code('#define ALIGNED_float __attribute__((aligned(float_ALIGN)))') code('#define ALIGNED_int __attribute__((aligned(int_ALIGN)))') code('#else') code('#define ALIGNED_double') code('#define ALIGNED_float') code('#define ALIGNED_int') code('#endif') code('') comm(' global constants ') for nc in range(0, len(consts)): if not consts[nc]['user_declared']: if consts[nc]['dim'] == 1: code('extern ' + consts[nc]['type'][1:-1] + ' ' + consts[nc]['name'] + ';') else: if consts[nc]['dim'] > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' code('extern ' + consts[nc]['type'][1:-1] + ' ' + consts[nc]['name'] + '[' + num + '];') code('') comm(' header ') if os.path.exists('./user_types.h'): code('#include "../user_types.h"') code('#include "op_lib_cpp.h"') code('') comm(' user kernel files') for nk in range(0, len(kernels)): code('#include "' + kernels[nk]['name'] + '_veckernel.cpp"') master = master.split('.')[0] fid = open('vec/' + master.split('.')[0] + '_veckernels.cpp', 'w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close()
def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): global dims, idxs, typs, indtyps, inddims global FORTRAN, CPP, g_m, file_text, depth OP_ID = 1; OP_GBL = 2; OP_MAP = 3; OP_READ = 1; OP_WRITE = 2; OP_RW = 3; OP_INC = 4; OP_MAX = 5; OP_MIN = 6; accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] inc_stage=0 op_color2=0 op_color2_force=0 ########################################################################## # create new kernel file ########################################################################## for nk in range (0,len(kernels)): name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk], inc_stage) any_soa = 0 any_soa = any_soa or sum(soaflags) # # set logicals # j = -1 for i in range(0,nargs): if maps[i] == OP_MAP and accs[i] == OP_INC: j = i ind_inc = j >= 0 j = -1 for i in range(0,nargs): if maps[i] == OP_MAP and accs[i] == OP_RW: j = i ind_rw = j >= 0 if ind_rw or op_color2_force: op_color2 = 1 else: op_color2 = 0 #no staging with 2 level colouring if op_color2: inc_stage=0 optidxs = [0]*nargs indopts = [-1]*nargs nopts = 0 for i in range(0,nargs): if optflags[i] == 1 and maps[i] == OP_ID: optidxs[i] = nopts nopts = nopts+1 elif optflags[i] == 1 and maps[i] == OP_MAP: if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination optidxs[i] = nopts indopts[inds[i]-1] = i nopts = nopts+1 else: optidxs[i] = optidxs[invinds[inds[i]-1]] j = -1 for i in range(0,nargs): if maps[i] == OP_GBL and accs[i] <> OP_READ and accs[i] <> OP_WRITE: j = i reduct = j >= 0 if inc_stage: ninds_staged = 0 inds_staged = [-1]*nargs for i in range(0,nargs): if maps[i]==OP_MAP and accs[i]==OP_INC: if inds_staged[invinds[inds[i]-1]] == -1: inds_staged[i] = ninds_staged ninds_staged = ninds_staged + 1 else: inds_staged[i] = inds_staged[invinds[inds[i]-1]] invinds_staged = [-1]*ninds_staged inddims_staged = [-1]*ninds_staged indopts_staged = [-1]*ninds_staged for i in range(0,nargs): if inds_staged[i] >= 0 and invinds_staged[inds_staged[i]] == -1: invinds_staged[inds_staged[i]] = i inddims_staged[inds_staged[i]] = dims[i] if optflags[i] == 1: indopts_staged[inds_staged[i]] = i for i in range(0,nargs): inds_staged[i] = inds_staged[i] + 1 ########################################################################## # start with CUDA kernel function ########################################################################## FORTRAN = 0; CPP = 1; g_m = 0; file_text = '' depth = 0 #strides for SoA if any_soa: if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('__constant__ int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;') code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;') dir_soa = -1 for g_m in range(0,nargs): if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): code('__constant__ int direct_'+name+'_stride_OP2CONSTANT;') code('int direct_'+name+'_stride_OP2HOST=-1;') dir_soa = g_m break file_name = decl_filepath f = open(file_name, 'r') kernel_text = f.read() f.close() if CPP: includes = op2_gen_common.extract_includes(kernel_text) if len(includes) > 0: for include in includes: code(include) code("") comm('user function') kernel_text = op2_gen_common.comment_remover(kernel_text) kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) p = re.compile('void\\s+\\b'+name+'\\b') i = p.search(kernel_text).start() if(i < 0): print "\n********" print "Error: cannot locate user kernel function name: "+name+" - Aborting code generation" exit(2) i2 = i #i = kernel_text[0:i].rfind('\n') #reverse find j = kernel_text[i:].find('{') k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}') signature_text = kernel_text[i:i+j] l = signature_text[0:].find('(') head_text = signature_text[0:l].strip() #save function name m = op2_gen_common.para_parse(signature_text, 0, '(', ')') signature_text = signature_text[l+1:m] body_text = kernel_text[i+j+1:k] ## Replace occurrences of '#include "<FILE>"' within loop with the contents of <FILE>: body_text = op2_gen_common.replace_local_includes_with_file_contents(body_text, os.path.dirname(master)) # check for number of arguments if len(signature_text.split(',')) != nargs_novec: print 'Error parsing user kernel('+name+'): must have '+str(nargs)+' arguments' return for i in range(0,nargs_novec): var = signature_text.split(',')[i].strip() if kernels[nk]['soaflags'][i] and (op_color2 or not (kernels[nk]['maps'][i] == OP_MAP and kernels[nk]['accs'][i] == OP_INC)): var = var.replace('*','') #locate var in body and replace by adding [idx] length = len(re.compile('\\s+\\b').split(var)) var2 = re.compile('\\s+\\b').split(var)[length-1].strip() if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP: body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) else: body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text) body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) for nc in range(0,len(consts)): varname = consts[nc]['name'] body_text = re.sub('\\b'+varname+'\\b', varname+'_cuda',body_text) signature_text = '__device__ '+head_text + '_gpu( '+signature_text + ') {' file_text += signature_text + body_text + '}\n' comm('') comm(' CUDA kernel function') if FORTRAN: code('subroutine op_cuda_'+name+'(') elif CPP: code('__global__ void op_cuda_'+name+'(') depth = 2 if nopts > 0: code('int optflags,') for g_m in range(0,ninds): if (indaccs[g_m]==OP_READ): code('const <INDTYP> *__restrict <INDARG>,') else: code('<INDTYP> *__restrict <INDARG>,') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('const int *__restrict opDat'+str(invinds[inds[g_m]-1])+'Map, ') for g_m in range(0,nargs): if maps[g_m] == OP_ID: if accs[g_m] == OP_READ: code('const <TYP> *__restrict <ARG>,') else: code('<TYP> *<ARG>,') elif maps[g_m] == OP_GBL: if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE: code('<TYP> *<ARG>,') elif accs[g_m] == OP_READ: code('const <TYP> *<ARG>,') if ind_inc and inc_stage==1: code('int *ind_map,') code('short *arg_map,') code('int *ind_arg_sizes,') code('int *ind_arg_offs, ') if ninds>0: if not op_color2: code('int block_offset, ') code('int *blkmap, ') code('int *offset, ') code('int *nelems, ') code('int *ncolors, ') code('int *colors, ') code('int nblocks, ') else: code('int start, ') code('int end, ') code('int *col_reord, ') code('int set_size) { ') else: code('int set_size ) {') code('') for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m] <> OP_WRITE: code('<TYP> <ARG>_l[<DIM>];') if accs[g_m] == OP_INC: FOR('d','0','<DIM>') code('<ARG>_l[d]=ZERO_<TYP>;') ENDFOR() else: FOR('d','0','<DIM>') code('<ARG>_l[d]=<ARG>[d+blockIdx.x*<DIM>];') ENDFOR() elif maps[g_m]==OP_MAP and accs[g_m]==OP_INC and not op_color2: code('<TYP> <ARG>_l[<DIM>];') if not op_color2: for m in range (1,ninds+1): g_m = m -1 v = [int(inds[i]==m) for i in range(len(inds))] v_i = [vectorised[i] for i in range(len(inds)) if inds[i] == m] if sum(v)>1 and sum(v_i)>0: #check this sum(v_i) if indaccs[m-1] == OP_INC: ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1 code('<INDTYP> *arg'+str(invinds[m-1])+'_vec['+str(ind)+'] = {'); depth += 2; for n in range(0,nargs): if inds[n] == m: g_m = n code('<ARG>_l,') depth -= 2 code('};') # # lengthy code for general case with indirection # if ninds>0 and not op_color2: code('') if inc_stage==1: for g_m in range (0,ninds): if indaccs[g_m] == OP_INC: code('__shared__ int *<INDARG>_map, <INDARG>_size;') code('__shared__ <INDTYP> *<INDARG>_s;') code('') if ind_inc: code('__shared__ int nelems2, ncolor;') code('__shared__ int nelem, offset_b;') code('') code('extern __shared__ char shared[];') code('') IF('blockIdx.x+blockIdx.y*gridDim.x >= nblocks') code('return;') ENDIF() IF('threadIdx.x==0') code('') comm('get sizes and shift pointers and direct-mapped data') code('') code('int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x + block_offset];') code('') code('nelem = nelems[blockId];') code('offset_b = offset[blockId];') code('') if ind_inc: code('nelems2 = blockDim.x*(1+(nelem-1)/blockDim.x);') code('ncolor = ncolors[blockId];') code('') if inc_stage==1 and ind_inc: for g_m in range (0,ninds_staged): if indopts_staged[g_m-1] > 0: IF('optflags & 1<<'+str(optidxs[indopts_staged[g_m-1]])) code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_size = ind_arg_sizes['+str(g_m)+'+blockId*'+ str(ninds_staged)+'];') if indopts_staged[g_m-1] > 0: ENDIF() code('') for m in range (1,ninds_staged+1): g_m = m - 1 c = [i for i in range(nargs) if inds_staged[i]==m] code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_map = &ind_map['+str(cumulative_indirect_index[c[0]])+\ '*set_size] + ind_arg_offs['+str(m-1)+'+blockId*'+str(ninds_staged)+'];') code('') comm('set shared memory pointers') code('int nbytes = 0;') for g_m in range(0,ninds_staged): code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_s = ('+typs[invinds_staged[g_m]]+' *) &shared[nbytes];') if g_m < ninds_staged-1: if indopts_staged[g_m-1] > 0: IF('optflags & 1<<'+str(optidxs[indopts_staged[g_m-1]])) code('nbytes += ROUND_UP(ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_size*sizeof('+typs[invinds_staged[g_m]]+')*'+dims[invinds_staged[g_m]]+');') if indopts_staged[g_m-1] > 0: ENDIF() ENDIF() code('__syncthreads(); // make sure all of above completed') code('') if inc_stage==1: for g_m in range(0,ninds): if indaccs[g_m] == OP_INC: FOR_INC('n','threadIdx.x','ind_ARG_size*<INDDIM>','blockDim.x') code('ind_ARG_s[n] = ZERO_<INDTYP>;') ENDFOR() if ind_inc: code('') code('__syncthreads();') code('') if ind_inc: FOR_INC('n','threadIdx.x','nelems2','blockDim.x') code('int col2 = -1;') k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map'+str(mapinds[g_m])+'idx;') IF('n<nelem') comm('initialise local variables') for g_m in range(0,nargs): if maps[g_m]==OP_MAP and accs[g_m]==OP_INC: FOR('d','0','<DIM>') code('<ARG>_l[d] = ZERO_<TYP>;') ENDFOR() else: FOR_INC('n','threadIdx.x','nelem','blockDim.x') k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map'+str(mapinds[g_m])+'idx;') #non-optional maps k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k): k = k + [(0*nargs+mapinds[g_m])] #non-opt k = k + [(1*nargs+mapinds[g_m])] #opt code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];') #whatever didn't come up and is opt for g_m in range(0,nargs): if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in k)): k = k + [(optflags[g_m]*nargs+mapinds[g_m])] if optflags[g_m]==1: IF('optflags & 1<<'+str(optidxs[g_m])) code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];') if optflags[g_m]==1: ENDIF() code('') for g_m in range (0,nargs): if accs[g_m] <> OP_INC: #TODO: add opt handling here u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] if len(u) > 0 and vectorised[g_m] > 0: if accs[g_m] == OP_READ: line = 'const <TYP>* <ARG>_vec[] = {\n' else: line = '<TYP>* <ARG>_vec[] = {\n' v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] first = [i for i in range(0,len(v)) if v[i] == 1] first = first[0] indent = ' '*(depth+2) for k in range(0,sum(v)): if soaflags[g_m]: line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[map'+str(mapinds[g_m+k])+'idx],\n' else: line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[<DIM> * map'+str(mapinds[g_m+k])+'idx],\n' line = line[:-2]+'};' code(line) # # simple version for global coloring # elif ninds>0: code('int tid = threadIdx.x + blockIdx.x * blockDim.x;') IF('tid + start < end') code('int n = col_reord[tid + start];') comm('initialise local variables') #mapidx declarations k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map'+str(mapinds[g_m])+'idx;') #non-optional maps k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k): k = k + [(0*nargs+mapinds[g_m])] #non-opt k = k + [(1*nargs+mapinds[g_m])] #opt code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];') #whatever didn't come up and is opt for g_m in range(0,nargs): if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in k)): k = k + [(optflags[g_m]*nargs+mapinds[g_m])] if optflags[g_m]==1: IF('optflags & 1<<'+str(optidxs[g_m])) code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];') if optflags[g_m]==1: ENDIF() for g_m in range (0,nargs): u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] if len(u) > 0 and vectorised[g_m] > 0: if accs[g_m] == OP_READ: line = 'const <TYP>* <ARG>_vec[] = {\n' else: line = '<TYP>* <ARG>_vec[] = {\n' v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] first = [i for i in range(0,len(v)) if v[i] == 1] first = first[0] indent = ' '*(depth+2) for k in range(0,sum(v)): if soaflags[g_m]: line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[map'+str(mapinds[g_m+k])+'idx],\n' else: line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[<DIM> * map'+str(mapinds[g_m+k])+'idx],\n' line = line[:-2]+'};' code(line) # # simple alternative when no indirection # else: code('') comm('process set elements') FOR_INC('n','threadIdx.x+blockIdx.x*blockDim.x','set_size','blockDim.x*gridDim.x') # # kernel call # code('') comm('user-supplied kernel call') line = name+'_gpu(' prefix = ' '*len(name) a = 0 #only apply indentation if its not the 0th argument indent ='' for m in range (0, nargs): if a > 0: indent = ' '+' '*len(name) if maps[m] == OP_GBL: if accs[m] == OP_READ or accs[m] == OP_WRITE: line += rep(indent+'<ARG>,\n',m) else: line += rep(indent+'<ARG>_l,\n',m); a =a+1 elif maps[m]==OP_MAP and accs[m]==OP_INC and not op_color2: if vectorised[m]: if m+1 in unique_args: line += rep(indent+'<ARG>_vec,\n',m) else: line += rep(indent+'<ARG>_l,\n',m) a =a+1 elif maps[m]==OP_MAP: if vectorised[m]: if m+1 in unique_args: line += rep(indent+'<ARG>_vec,\n',m) else: if soaflags[m]: line += rep(indent+'ind_arg'+str(inds[m]-1)+'+map'+str(mapinds[m])+'idx,'+'\n',m) else: line += rep(indent+'ind_arg'+str(inds[m]-1)+'+map'+str(mapinds[m])+'idx*<DIM>,'+'\n',m) a =a+1 elif maps[m]==OP_ID: if ninds>0 and not op_color2: if soaflags[m]: line += rep(indent+'<ARG>+(n+offset_b),\n',m) else: line += rep(indent+'<ARG>+(n+offset_b)*<DIM>,\n',m) a =a+1 else: if soaflags[m]: line += rep(indent+'<ARG>+n,\n',m) else: line += rep(indent+'<ARG>+n*<DIM>,\n',m) a =a+1 else: print 'internal error 1 ' code(line[0:-2]+');') #remove final ',' and \n # # updating for indirect kernels ... # if ninds>0 and not op_color2: if ind_inc: code('col2 = colors[n+offset_b];') ENDIF() code('') comm('store local variables') code('') if inc_stage==1: for g_m in range(0,nargs): if maps[g_m]==OP_MAP and accs[g_m]==OP_INC: code('int <ARG>_map;') IF('col2>=0') for g_m in range(0,nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: code('<ARG>_map = arg_map['+str(cumulative_indirect_index[g_m])+'*set_size+n+offset_b];') ENDIF() code('') FOR('col','0','ncolor') IF('col2==col') if inc_stage==1: for g_m in range(0,nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: if optflags[g_m]==1: IF('optflags & 1<<'+str(optidxs[g_m])) for d in range(0,int(dims[g_m])): if soaflags[g_m]: code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'_s[<ARG>_map+'+str(d)+'*ind_arg'+str(inds[g_m]-1)+'_size];') else: code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'_s['+str(d)+'+<ARG>_map*<DIM>];') # for g_m in range(0,nargs): # if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: for d in range(0,int(dims[g_m])): if soaflags[g_m]: code('ind_arg'+str(inds[g_m]-1)+'_s[<ARG>_map+'+str(d)+'*ind_arg'+str(inds[g_m]-1)+'_size] = <ARG>_l['+str(d)+'];') else: code('ind_arg'+str(inds[g_m]-1)+'_s['+str(d)+'+<ARG>_map*<DIM>] = <ARG>_l['+str(d)+'];') if optflags[g_m]==1: ENDIF() else: for g_m in range(0,nargs): if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: if optflags[g_m]==1: IF('optflags & 1<<'+str(optidxs[g_m])) for d in range(0,int(dims[g_m])): if soaflags[g_m]: code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx];') else: code('<ARG>_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*<DIM>];') # for g_m in range(0,nargs): # if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: for d in range(0,int(dims[g_m])): if soaflags[g_m]: code('ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx] = <ARG>_l['+str(d)+'];') else: code('ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*<DIM>] = <ARG>_l['+str(d)+'];') if optflags[g_m]==1: ENDIF() ENDFOR() code('__syncthreads();') ENDFOR() ENDFOR() if inc_stage: for g_m in range(0,ninds): if indaccs[g_m]==OP_INC: if indopts[g_m] > 0: IF('optflags & 1<<'+str(optidxs[indopts[g_m-1]])) if soaflags[invinds[g_m]]: FOR_INC('n','threadIdx.x','<INDARG>_size','blockDim.x') for d in range(0,int(dims[invinds[g_m]])): code('arg'+str(invinds[g_m])+'_l['+str(d)+'] = <INDARG>_s[n+'+str(d)+'*<INDARG>_size] + <INDARG>[<INDARG>_map[n]+'+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'];') for d in range(0,int(dims[invinds[g_m]])): code('<INDARG>[<INDARG>_map[n]+'+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'] = arg'+str(invinds[g_m])+'_l['+str(d)+'];') ENDFOR() else: FOR_INC('n','threadIdx.x','<INDARG>_size*<INDDIM>','blockDim.x') code('<INDARG>[n%<INDDIM>+<INDARG>_map[n/<INDDIM>]*<INDDIM>] += <INDARG>_s[n];') ENDFOR() if indopts[g_m] > 0: ENDIF() # # global reduction # if reduct: code('') comm('global reductions') code('') for m in range (0,nargs): g_m = m if maps[m]==OP_GBL and accs[m]<>OP_READ and accs[m] <> OP_WRITE: FOR('d','0','<DIM>') if accs[m]==OP_INC: code('op_reduction<OP_INC>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);') elif accs[m]==OP_MIN: code('op_reduction<OP_MIN>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);') elif accs[m]==OP_MAX: code('op_reduction<OP_MAX>(&<ARG>[d+blockIdx.x*<DIM>],<ARG>_l[d]);') else: print 'internal error: invalid reduction option' sys.exit(2); ENDFOR() depth -= 2 code('}') code('') ########################################################################## # then C++ stub function ########################################################################## code('') comm('host stub function') code('void op_par_loop_'+name+'(char const *name, op_set set,') depth += 2 for m in unique_args: g_m = m - 1 if m == unique_args[len(unique_args)-1]: code('op_arg <ARG>){') code('') else: code('op_arg <ARG>,') for g_m in range (0,nargs): if maps[g_m]==OP_GBL: code('<TYP>*<ARG>h = (<TYP> *)<ARG>.data;') code('int nargs = '+str(nargs)+';') code('op_arg args['+str(nargs)+'];') code('') for g_m in range (0,nargs): u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] if len(u) > 0 and vectorised[g_m] > 0: code('<ARG>.idx = 0;') code('args['+str(g_m)+'] = <ARG>;') v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] first = [i for i in range(0,len(v)) if v[i] == 1] first = first[0] if (optflags[g_m] == 1): argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, ' else: argtyp = 'op_arg_dat(' FOR('v','1',str(sum(v))) code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');') ENDFOR() code('') elif vectorised[g_m]>0: pass else: code('args['+str(g_m)+'] = <ARG>;') if nopts>0: code('int optflags = 0;') for i in range(0,nargs): if optflags[i] == 1: IF('args['+str(i)+'].opt') code('optflags |= 1<<'+str(optidxs[i])+';') ENDIF() if nopts > 30: print 'ERROR: too many optional arguments to store flags in an integer' # # start timing # code('') comm(' initialise timers') code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') code('op_timing_realloc('+str(nk)+');') code('op_timers_core(&cpu_t1, &wall_t1);') code('OP_kernels[' +str(nk)+ '].name = name;') code('OP_kernels[' +str(nk)+ '].count += 1;') code('') # # indirect bits # if ninds>0: code('') code('int ninds = '+str(ninds)+';') line = 'int inds['+str(nargs)+'] = {' for m in range(0,nargs): line += str(inds[m]-1)+',' code(line[:-1]+'};') code('') IF('OP_diags>2') code('printf(" kernel routine with indirection: '+name+'\\n");') ENDIF() code('') comm('get plan') code('#ifdef OP_PART_SIZE_'+ str(nk)) code(' int part_size = OP_PART_SIZE_'+str(nk)+';') code('#else') code(' int part_size = OP_part_size;') code('#endif') code('') code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') # # direct bit # else: code('') IF('OP_diags>2') code('printf(" kernel routine w/o indirection: '+ name + '");') ENDIF() code('') code('op_mpi_halo_exchanges_cuda(set, nargs, args);') IF('set->size > 0') code('') # # kernel call for indirect version # if ninds>0: if inc_stage==1 and ind_inc: code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_INC);') elif op_color2: code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);') else: code('op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);') code('') # # transfer constants # g = [i for i in range(0,nargs) if maps[i] == OP_GBL and (accs[i] == OP_READ or accs[i] == OP_WRITE)] if len(g)>0: comm('transfer constants to GPU') code('int consts_bytes = 0;') for m in range(0,nargs): g_m = m if maps[m]==OP_GBL and (accs[m]==OP_READ or accs[m] == OP_WRITE): code('consts_bytes += ROUND_UP(<DIM>*sizeof(<TYP>));') code('reallocConstArrays(consts_bytes);') code('consts_bytes = 0;') for m in range(0,nargs): if maps[m]==OP_GBL and (accs[m] == OP_READ or accs[m] == OP_WRITE): g_m = m code('<ARG>.data = OP_consts_h + consts_bytes;') code('<ARG>.data_d = OP_consts_d + consts_bytes;') FOR('d','0','<DIM>') code('((<TYP> *)<ARG>.data)[d] = <ARG>h[d];') ENDFOR() code('consts_bytes += ROUND_UP(<DIM>*sizeof(<TYP>));') code('mvConstArraysToDevice(consts_bytes);') code('') #managing constants if any_soa: if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))') code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');') code('cudaMemcpyToSymbol(opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT, &opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST,sizeof(int));') ENDIF() if dir_soa<>-1: IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))') code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');') code('cudaMemcpyToSymbol(direct_'+name+'_stride_OP2CONSTANT,&direct_'+name+'_stride_OP2HOST,sizeof(int));') ENDIF() # # transfer global reduction initial data # if ninds == 0: comm('set CUDA execution parameters') code('#ifdef OP_BLOCK_SIZE_'+str(nk)) code(' int nthread = OP_BLOCK_SIZE_'+str(nk)+';') code('#else') code(' int nthread = OP_block_size;') comm(' int nthread = 128;') code('#endif') code('') code('int nblocks = 200;') code('') if reduct: comm('transfer global reduction data to GPU') if ninds>0: code('int maxblocks = 0;') FOR('col','0','Plan->ncolors') code('maxblocks = MAX(maxblocks,Plan->ncolblk[col]);') ENDFOR() else: code('int maxblocks = nblocks;') code('int reduct_bytes = 0;') code('int reduct_size = 0;') for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE: code('reduct_bytes += ROUND_UP(maxblocks*<DIM>*sizeof(<TYP>));') code('reduct_size = MAX(reduct_size,sizeof(<TYP>));') code('reallocReductArrays(reduct_bytes);') code('reduct_bytes = 0;') for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ and accs[g_m]<>OP_WRITE: code('<ARG>.data = OP_reduct_h + reduct_bytes;') code('<ARG>.data_d = OP_reduct_d + reduct_bytes;') FOR('b','0','maxblocks') FOR('d','0','<DIM>') if accs[g_m]==OP_INC: code('((<TYP> *)<ARG>.data)[d+b*<DIM>] = ZERO_<TYP>;') else: code('((<TYP> *)<ARG>.data)[d+b*<DIM>] = <ARG>h[d];') ENDFOR() ENDFOR() code('reduct_bytes += ROUND_UP(maxblocks*<DIM>*sizeof(<TYP>));') code('mvReductArraysToDevice(reduct_bytes);') code('') # # kernel call for indirect version # if ninds>0: comm('execute plan') if not op_color2: code('') code('int block_offset = 0;') FOR('col','0','Plan->ncolors') IF('col==Plan->ncolors_core') code('op_mpi_wait_all_cuda(nargs, args);') ENDIF() code('#ifdef OP_BLOCK_SIZE_'+str(nk)) code('int nthread = OP_BLOCK_SIZE_'+str(nk)+';') code('#else') code('int nthread = OP_block_size;') code('#endif') code('') if op_color2: code('int start = Plan->col_offsets[0][col];') code('int end = Plan->col_offsets[0][col+1];') code('int nblocks = (end - start - 1)/nthread + 1;') else: code('dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col],') code('Plan->ncolblk[col] >= (1<<16) ? (Plan->ncolblk[col]-1)/65535+1: 1, 1);') IF('Plan->ncolblk[col] > 0') if reduct or (inc_stage==1 and ind_inc): if reduct and inc_stage==1: code('int nshared = MAX(Plan->nshared,reduct_size*nthread);') elif reduct: code('int nshared = reduct_size*nthread;') else: code('int nshared = Plan->nsharedCol[col];') code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(') else: code('op_cuda_'+name+'<<<nblocks,nthread>>>(') if nopts > 0: code('optflags,') for m in range(1,ninds+1): g_m = invinds[m-1] code('(<TYP> *)<ARG>.data_d,') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('arg'+str(g_m)+'.map_data_d, ') for g_m in range(0,nargs): if inds[g_m]==0: code('(<TYP>*)<ARG>.data_d,') if inc_stage==1 and ind_inc: code('Plan->ind_map,') code('Plan->loc_map,') code('Plan->ind_sizes,') code('Plan->ind_offs,') if op_color2: code('start,') code('end,') code('Plan->col_reord,') else: code('block_offset,') code('Plan->blkmap,') code('Plan->offset,') code('Plan->nelems,') code('Plan->nthrcol,') code('Plan->thrcol,') code('Plan->ncolblk[col],') code('set->size+set->exec_size);') code('') if reduct: comm('transfer global reduction data back to CPU') IF('col == Plan->ncolors_owned-1') code('mvReductArraysToHost(reduct_bytes);') ENDIF() if not op_color2: ENDFOR() #TODO sztem ez forditva van... code('block_offset += Plan->ncolblk[col];') ENDIF() # # kernel call for direct version # else: if reduct: code('int nshared = reduct_size*nthread;') code('op_cuda_'+name+'<<<nblocks,nthread,nshared>>>(') else: code('op_cuda_'+name+'<<<nblocks,nthread>>>(') indent = ' '#*(len(name)+42) if nopts > 0: code(indent+'optflags,') for g_m in range(0,nargs): if g_m > 0: code(indent+'(<TYP> *) <ARG>.data_d,') else: code(indent+'(<TYP> *) <ARG>.data_d,') code(indent+'set->size );') if ninds>0: code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') # # transfer global reduction initial data # if reduct: if ninds == 0: comm('transfer global reduction data back to CPU') code('mvReductArraysToHost(reduct_bytes);') for m in range(0,nargs): g_m = m if maps[m]==OP_GBL and accs[m]<>OP_READ and accs[m] <> OP_WRITE: FOR('b','0','maxblocks') FOR('d','0','<DIM>') if accs[m]==OP_INC: code('<ARG>h[d] = <ARG>h[d] + ((<TYP> *)<ARG>.data)[d+b*<DIM>];') elif accs[m]==OP_MIN: code('<ARG>h[d] = MIN(<ARG>h[d],((<TYP> *)<ARG>.data)[d+b*<DIM>]);') elif accs[m]==OP_MAX: code('<ARG>h[d] = MAX(<ARG>h[d],((<TYP> *)<ARG>.data)[d+b*<DIM>]);') ENDFOR() ENDFOR() code('<ARG>.data = (char *)<ARG>h;') code('op_mpi_reduce(&<ARG>,<ARG>h);') for g_m in range(0,nargs): if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE: code('') code('mvConstArraysToHost(consts_bytes);') break for g_m in range(0,nargs): if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE: FOR('d','0','<DIM>') code('<ARG>h[d] = ((<TYP> *)<ARG>.data)[d];') ENDFOR() code('<ARG>.data = (char *)<ARG>h;') code('op_mpi_reduce(&<ARG>,<ARG>h);') ENDIF() code('op_mpi_set_dirtybit_cuda(nargs, args);') # # update kernel record # code('cutilSafeCall(cudaDeviceSynchronize());') comm('update kernel record') code('op_timers_core(&cpu_t2, &wall_t2);') code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') if ninds == 0: line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' for g_m in range (0,nargs): if optflags[g_m]==1: IF('<ARG>.opt') if maps[g_m]<>OP_GBL: if accs[g_m]==OP_READ: code(line+' <ARG>.size;') else: code(line+' <ARG>.size * 2.0f;') if optflags[g_m]==1: ENDIF() depth = depth - 2 code('}') ########################################################################## # output individual kernel file ########################################################################## if not os.path.exists('cuda'): os.makedirs('cuda') fid = open('cuda/'+name+'_kernel.cu','w') date = datetime.datetime.now() fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() # end of main kernel call loop ########################################################################## # output one master kernel file ########################################################################## file_text = '' comm('global constants') code('#ifndef MAX_CONST_SIZE') code('#define MAX_CONST_SIZE 128') code('#endif') code('') for nc in range (0,len(consts)): if consts[nc]['dim']==1: code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_cuda;') else: if consts[nc]['dim'] > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_cuda['+num+'];') code('') comm('header') if os.path.exists('./user_types.h'): code('#ifndef OP_FUN_PREFIX\n#define OP_FUN_PREFIX __host__ __device__\n#endif') code('#include "../user_types.h"') code('#include "op_lib_cpp.h"') code('#include "op_cuda_rt_support.h"') code('#include "op_cuda_reduction.h"') code('') code('void op_decl_const_char(int dim, char const *type,') code('int size, char *dat, char const *name){') depth = depth + 2 code('if (!OP_hybrid_gpu) return;') for nc in range(0,len(consts)): IF('!strcmp(name,"'+consts[nc]['name']+'")') if consts[nc]['dim'] < 0: IF('!strcmp(name,"'+consts[nc]['name']+'") && size>MAX_CONST_SIZE) {') code('printf("error: MAX_CONST_SIZE not big enough\n"); exit(1);') ENDIF() code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+'_cuda, dat, dim*size));') ENDIF() code('else ') code('{') depth = depth + 2 code('printf("error: unknown const name\\n"); exit(1);') ENDIF() depth = depth - 2 code('}') code('') comm('user kernel files') for nk in range(0,len(kernels)): file_text = file_text +\ '#include "'+kernels[nk]['name']+'_kernel.cu"\n' master = master.split('.')[0] fid = open('cuda/'+master.split('.')[0]+'_kernels.cu','w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close()
def op2_gen_mpi_vec(master, date, consts, kernels): global dims, idxs, typs, indtyps, inddims global FORTRAN, CPP, g_m, file_text, depth OP_ID = 1; OP_GBL = 2; OP_MAP = 3; OP_READ = 1; OP_WRITE = 2; OP_RW = 3; OP_INC = 4; OP_MAX = 5; OP_MIN = 6; accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] grouped = 0 any_soa = 0 for nk in range (0,len(kernels)): any_soa = any_soa or sum(kernels[nk]['soaflags']) ########################################################################## # create new kernel file ########################################################################## for nk in range (0,len(kernels)): name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk]) # # set three logicals # j = -1 for i in range(0,nargs): if maps[i] == OP_MAP and accs[i] == OP_INC: j = i ind_inc = j >= 0 j = -1 for i in range(0,nargs): if maps[i] == OP_GBL and accs[i] != OP_READ: j = i reduct = j >= 0 j = -1 for i in range(0,nargs): if maps[i] == OP_MAP : j = i indirect_kernel = j >= 0 if nargs != nargs_novec: return #################################################################################### # generate the user kernel function - creating versions for vectorisation as needed #################################################################################### FORTRAN = 0; CPP = 1; g_m = 0; file_text = '' depth = 0 # # First original version # comm('user function') file_name = decl_filepath f = open(file_name, 'r') kernel_text = f.read() file_text += kernel_text f.close() ## Clang compiler can struggle to vectorize a loop if it uses a mix of ## Python-generated simd arrays for indirect data AND pointers to direct ## data. Fix by also generating simd arrays for direct data: do_gen_direct_simd_arrays = True # # Modified vectorisable version if its an indirect kernel # - direct kernels can be vectorised without modification # if indirect_kernel: code('#ifdef VECTORIZE') comm('user function -- modified for vectorisation') f = open(file_name, 'r') kernel_text = f.read() f.close() kernel_text = op2_gen_common.comment_remover(kernel_text) kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) p = re.compile('void\\s+\\b'+name+'\\b') i = p.search(kernel_text).start() if(i < 0): print("\n********") print("Error: cannot locate user kernel function name: "+name+" - Aborting code generation") exit(2) i2 = i #i = kernel_text[0:i].rfind('\n') #reverse find j = kernel_text[i:].find('{') k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}') signature_text = kernel_text[i:i+j] l = signature_text[0:].find('(') head_text = signature_text[0:l] #save function name m = op2_gen_common.para_parse(signature_text, 0, '(', ')') signature_text = signature_text[l+1:m] body_text = kernel_text[i+j+1:k] ## Replace occurrences of '#include "<FILE>"' within loop with the contents of <FILE>: body_text = op2_gen_common.replace_local_includes_with_file_contents(body_text, os.path.dirname(master)) # check for number of arguments nargs_actual = len(signature_text.split(',')) if nargs_actual != nargs: print(('Error parsing user kernel({0}): must have {1} arguments (instead it has {2})'.format(name, nargs, nargs_actual))) return new_signature_text = '' for i in range(0,nargs): var = signature_text.split(',')[i].strip() if do_gen_direct_simd_arrays: do_gen_simd_array_arg = maps[i] != OP_GBL else: do_gen_simd_array_arg = maps[i] != OP_GBL and maps[i] != OP_ID if do_gen_simd_array_arg: #remove * and add [*][SIMD_VEC] var = var.replace('*','') #locate var in body and replace by adding [idx] length = len(re.compile('\\s+\\b').split(var)) var2 = re.compile('\\s+\\b').split(var)[length-1].strip() #print var2 body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text) array_access_pattern = '\[[\w\(\)\+\-\*\s\\\\]*\]' ## It has been observed that vectorisation can fail on loops with increments, ## but replacing them with writes succeeds. ## For example with Clang on particular loops, vectorisation fails with message: ## "loop not vectorized: loop control flow is not understood by vectorizer" ## replacing increments with writes solves this. ## Replacement is data-safe due to use of local/intermediate SIMD arrays. ## Hopefully the regex is matching all increments. ## And for loops that were being vectorised, this change can give a small perf boost. if maps[i] == OP_MAP and accs[i] == OP_INC: ## Replace 'var' increments with writes: body_text = re.sub(r'('+var2+array_access_pattern+'\s*'+')'+re.escape("+="), r'\1'+'=', body_text) ## Append vector array access: body_text = re.sub(r'('+var2+array_access_pattern+')', r'\1'+'[idx]', body_text) var = var + '[][SIMD_VEC]' #var = var + '[restrict][SIMD_VEC]' new_signature_text += var+', ' #add ( , idx and ) signature_text = "#if defined __clang__ || defined __GNUC__\n" signature_text += "__attribute__((always_inline))\n" signature_text += "#endif\n" signature_text += "inline " + head_text + '( '+new_signature_text + 'int idx ) {' #finally update name signature_text = signature_text.replace(name,name+'_vec') #print head_text #print signature_text #print body_text file_text += signature_text + body_text + '}\n' code('#endif'); ########################################################################## # then C++ stub function ########################################################################## code('') comm(' host stub function') code('void op_par_loop_'+name+'(char const *name, op_set set,') depth += 2 for m in unique_args: g_m = m - 1 if m == unique_args[len(unique_args)-1]: code('op_arg <ARG>){'); code('') else: code('op_arg <ARG>,') code('int nargs = '+str(nargs)+';') code('op_arg args['+str(nargs)+'];') code('') for g_m in range (0,nargs): u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] if len(u) > 0 and vectorised[g_m] > 0: code('<ARG>.idx = 0;') code('args['+str(g_m)+'] = <ARG>;') v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] first = [i for i in range(0,len(v)) if v[i] == 1] first = first[0] if (optflags[g_m] == 1): argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, ' else: argtyp = 'op_arg_dat(' FOR('v','1',str(sum(v))) code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');') ENDFOR() code('') elif vectorised[g_m]>0: pass else: code('args['+str(g_m)+'] = <ARG>;') # # create aligned pointers # comm('create aligned pointers for dats') for g_m in range (0,nargs): if maps[g_m] != OP_GBL: if (accs[g_m] == OP_INC or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE): code('ALIGNED_<TYP> <TYP> * __restrict__ ptr'+\ str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;') #code('<TYP>* __restrict__ __attribute__((align_value (<TYP>_ALIGN))) ptr'+\ #str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;') code('DECLARE_PTR_ALIGNED(ptr'+str(g_m)+',<TYP>_ALIGN);') else: code('ALIGNED_<TYP> const <TYP> * __restrict__ ptr'+\ str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;') code('DECLARE_PTR_ALIGNED(ptr'+str(g_m)+',<TYP>_ALIGN);') #code('const <TYP>* __restrict__ __attribute__((align_value (<TYP>_ALIGN))) ptr'+\ #str(g_m)+' = (<TYP> *) arg'+str(g_m)+'.data;') # # start timing # code('') comm(' initialise timers') code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') code('op_timing_realloc('+str(nk)+');') code('op_timers_core(&cpu_t1, &wall_t1);') code('') # # indirect bits # if ninds>0: IF('OP_diags>2') code('printf(" kernel routine with indirection: '+name+'\\n");') ENDIF() # # direct bit # else: code('') IF('OP_diags>2') code('printf(" kernel routine w/o indirection: '+ name + '");') ENDIF() code('') if grouped: code('int exec_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 1);') else: code('int exec_size = op_mpi_halo_exchanges(set, nargs, args);') code('') IF('exec_size >0') code('') # # kernel call for indirect version # if ninds>0: code('#ifdef VECTORIZE') code('#pragma novector') FOR2('n','0','(exec_size/SIMD_VEC)*SIMD_VEC','SIMD_VEC') #initialize globals for g_m in range(0,nargs): if maps[g_m] == OP_GBL: code('<TYP> dat{0}[SIMD_VEC];'.format(g_m)) FOR('i','0','SIMD_VEC') if accs[g_m] == OP_INC: code('dat{0}[i] = 0.0;'.format(g_m)) elif accs[g_m] == OP_MAX: code('dat{0}[i] = -INFINITY;'.format(g_m)) elif accs[g_m] == OP_MIN: code('dat{0}[i] = INFINITY;'.format(g_m)) elif accs[g_m] == OP_READ: code('dat{0}[i] = *((<TYP>*)arg{0}.data);'.format(g_m)) ENDFOR() code('if (n<set->core_size && n>0 && n % OP_mpi_test_frequency == 0)') code(' op_mpi_test_all(nargs,args);') IF('(n+SIMD_VEC >= set->core_size) && (n+SIMD_VEC-set->core_size < SIMD_VEC)') if grouped: code('op_mpi_wait_all_grouped(nargs, args, 1);') else: code('op_mpi_wait_all(nargs, args);') ENDIF() for g_m in range(0,nargs): if do_gen_direct_simd_arrays: if (maps[g_m] in [OP_MAP, OP_ID]) and (accs[g_m] in [OP_READ, OP_RW, OP_WRITE, OP_INC]): code('ALIGNED_<TYP> <TYP> dat'+str(g_m)+'[<DIM>][SIMD_VEC];') else: if maps[g_m] == OP_MAP and (accs[g_m] in [OP_READ, OP_RW, OP_WRITE, OP_INC]): code('ALIGNED_<TYP> <TYP> dat'+str(g_m)+'[<DIM>][SIMD_VEC];') #setup gathers idx_map_template = "int idx{0}_<DIM> = <DIM> * arg{1}.map_data[(n+i) * arg{1}.map->dim + {2}];" idx_id_template = "int idx{0}_<DIM> = <DIM> * (n+i);" code('#pragma omp simd simdlen(SIMD_VEC)') FOR('i','0','SIMD_VEC') if nmaps > 0: for g_m in range(0,nargs): if maps[g_m] == OP_MAP : if (accs[g_m] in [OP_READ, OP_RW, OP_WRITE]):#and (not mapinds[g_m] in k): code(idx_map_template.format(g_m, invmapinds[inds[g_m]-1], idxs[g_m])) elif do_gen_direct_simd_arrays and maps[g_m] == OP_ID : code(idx_id_template.format(g_m)) code('') init_dat_template = "dat{0}[{1}][i] = (ptr{0})[idx{0}_<DIM> + {1}];" zero_dat_template = "dat{0}[{1}][i] = 0.0;" for g_m in range(0,nargs): if do_gen_direct_simd_arrays: ## also 'gather' directly-accessed data, because SOME compilers ## struggle to vectorise otherwise (e.g. Clang). if maps[g_m] != OP_GBL : if accs[g_m] in [OP_READ, OP_RW]: for d in range(0,int(dims[g_m])): code(init_dat_template.format(g_m, d)) code('') elif accs[g_m] == OP_INC: for d in range(0,int(dims[g_m])): code(zero_dat_template.format(g_m, d)) code('') else: if maps[g_m] == OP_MAP : if accs[g_m] in [OP_READ, OP_RW]:#and (not mapinds[g_m] in k): for d in range(0,int(dims[g_m])): init_dat_str = init_dat_template.format(g_m, d) code(init_dat_str) code('') elif (accs[g_m] == OP_INC): for d in range(0,int(dims[g_m])): zero_dat_str = zero_dat_template.format(g_m, d) code(zero_dat_str) code('') else: #globals if (accs[g_m] == OP_INC): # for d in range(0,int(dims[g_m])): # code('dat'+str(g_m)+'[i] = 0.0;') # code('') pass ENDFOR() #kernel call code('#pragma omp simd simdlen(SIMD_VEC)') FOR('i','0','SIMD_VEC') line = name+'_vec(' indent = '\n'+' '*(depth+2) for g_m in range(0,nargs): if (not do_gen_direct_simd_arrays) and maps[g_m] == OP_ID: line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * (n+i)],' elif maps[g_m] == OP_GBL and accs[g_m] == OP_READ: line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data,' elif maps[g_m] == OP_GBL and accs[g_m] == OP_INC: line = line + indent +'&dat'+str(g_m)+'[i],' else: line = line + indent + 'dat'+str(g_m)+',' line = line +indent +'i);' code(line) ENDFOR() #do the scatters FOR('i','0','SIMD_VEC') if nmaps > 0: for g_m in range(0,nargs): if maps[g_m] == OP_MAP : if (accs[g_m] in [OP_INC, OP_RW, OP_WRITE]):#and (not mapinds[g_m] in k): code(idx_map_template.format(g_m, invmapinds[inds[g_m]-1], idxs[g_m])) elif do_gen_direct_simd_arrays and maps[g_m] == OP_ID : if (accs[g_m] in [OP_INC, OP_RW, OP_WRITE]): code(idx_id_template.format(g_m)) code('') dat_scatter_inc_template = "(ptr{0})[idx{0}_<DIM> + {1}] += dat{0}[{1}][i];" dat_scatter_wr_template = "(ptr{0})[idx{0}_<DIM> + {1}] = dat{0}[{1}][i];" for g_m in range(0,nargs): if maps[g_m] == OP_MAP : if (accs[g_m] == OP_INC ): for d in range(0,int(dims[g_m])): code(dat_scatter_inc_template.format(g_m, d)) code('') elif accs[g_m] in [OP_WRITE, OP_RW]: for d in range(0,int(dims[g_m])): code(dat_scatter_wr_template.format(g_m, d)) code('') elif do_gen_direct_simd_arrays and maps[g_m] == OP_ID: ## also scatter directly-written data if (accs[g_m] == OP_INC ): for d in range(0,int(dims[g_m])): code(dat_scatter_inc_template.format(g_m, d)) elif accs[g_m] in [OP_WRITE, OP_RW]: for d in range(0,int(dims[g_m])): code(dat_scatter_wr_template.format(g_m, d)) code('') ENDFOR() #do reductions for g_m in range(0,nargs): if maps[g_m] == OP_GBL: FOR('i','0','SIMD_VEC') if accs[g_m] == OP_INC: code('*(<TYP>*)arg'+str(g_m)+'.data += dat'+str(g_m)+'[i];') elif accs[g_m] == OP_MAX: code('*(<TYP>*)arg'+str(g_m)+'.data = MAX(*(<TYP>*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);') elif accs[g_m] == OP_MIN: code('*(<TYP>*)arg'+str(g_m)+'.data = MIN(*(<TYP>*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);') ENDFOR() ENDFOR() code('') comm('remainder') FOR('n','(exec_size/SIMD_VEC)*SIMD_VEC','exec_size') depth = depth -2 code('#else') FOR('n','0','exec_size') depth = depth -2 code('#endif') depth = depth +2 IF('n==set->core_size') if grouped: code('op_mpi_wait_all_grouped(nargs, args, 1);') else: code('op_mpi_wait_all(nargs, args);') ENDIF() if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map'+str(mapinds[g_m])+'idx;') #do non-optional ones if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k) and (not optflags[g_m]): k = k + [mapinds[g_m]] code('map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];') #do optional ones if nmaps > 0: for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): if optflags[g_m]: IF('<ARG>.opt') else: k = k + [mapinds[g_m]] code('map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];') if optflags[g_m]: ENDIF() code('') line = name+'(' indent = '\n'+' '*(depth+2) for g_m in range(0,nargs): if maps[g_m] == OP_ID: line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * n]' if maps[g_m] == OP_MAP: line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]' if maps[g_m] == OP_GBL: line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data' if g_m < nargs-1: line = line +',' else: line = line +');' code(line) ENDFOR() # # kernel call for direct version # else: code('#ifdef VECTORIZE') code('#pragma novector') FOR2('n','0','(exec_size/SIMD_VEC)*SIMD_VEC','SIMD_VEC') #initialize globals for g_m in range(0,nargs): if maps[g_m] == OP_GBL: code('<TYP> dat{0}[SIMD_VEC];'.format(g_m)) FOR('i','0','SIMD_VEC') if accs[g_m] == OP_INC: code('dat{0}[i] = 0.0;'.format(g_m)) elif accs[g_m] == OP_MAX: code('dat{0}[i] = -INFINITY;'.format(g_m)) elif accs[g_m] == OP_MIN: code('dat{0}[i] = INFINITY;'.format(g_m)) elif accs[g_m] == OP_READ: code('dat{0}[i] = *((<TYP>*)arg{0}.data);'.format(g_m)) ENDFOR() code('#pragma omp simd simdlen(SIMD_VEC)') FOR('i','0','SIMD_VEC') line = name+'(' indent = '\n'+' '*(depth+2) for g_m in range(0,nargs): if maps[g_m] == OP_ID: line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * (n+i)]' if maps[g_m] == OP_MAP: line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]' if maps[g_m] == OP_GBL: line = line + indent +'&dat'+str(g_m)+'[i]' if g_m < nargs-1: line = line +',' else: line = line +');' code(line) ENDFOR() #do reductions for g_m in range(0,nargs): if maps[g_m] == OP_GBL: FOR('i','0','SIMD_VEC') if accs[g_m] == OP_INC: code('*(<TYP>*)arg'+str(g_m)+'.data += dat'+str(g_m)+'[i];') elif accs[g_m] == OP_MAX: code('*(<TYP>*)arg'+str(g_m)+'.data = MAX(*(<TYP>*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);') elif accs[g_m] == OP_MIN: code('*(<TYP>*)arg'+str(g_m)+'.data = MIN(*(<TYP>*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);') ENDFOR() ENDFOR() comm('remainder') FOR ('n','(exec_size/SIMD_VEC)*SIMD_VEC','exec_size') depth = depth -2 code('#else') FOR('n','0','exec_size') depth = depth -2 code('#endif') depth = depth +2 line = name+'(' indent = '\n'+' '*(depth+2) for g_m in range(0,nargs): if maps[g_m] == OP_ID: line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+'*n]' if maps[g_m] == OP_GBL: line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data' if g_m < nargs-1: line = line +',' else: line = line +');' code(line) ENDFOR() ENDIF() code('') #zero set size issues if ninds>0: IF('exec_size == 0 || exec_size == set->core_size') if grouped: code('op_mpi_wait_all_grouped(nargs, args, 1);') else: code('op_mpi_wait_all(nargs, args);') ENDIF() # # combine reduction data from multiple OpenMP threads # comm(' combine reduction data') for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ: code('op_mpi_reduce(&<ARG>,('+typs[g_m]+'*)<ARG>.data);') code('op_mpi_set_dirtybit(nargs, args);') code('') # # update kernel record # comm(' update kernel record') code('op_timers_core(&cpu_t2, &wall_t2);') code('OP_kernels[' +str(nk)+ '].name = name;') code('OP_kernels[' +str(nk)+ '].count += 1;') code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') if ninds == 0: line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' for g_m in range (0,nargs): if maps[g_m]!=OP_GBL: if accs[g_m]==OP_READ: code(line+' <ARG>.size;') else: code(line+' <ARG>.size * 2.0f;') else: names = [] for g_m in range(0,ninds): mult='' if indaccs[g_m] != OP_WRITE and indaccs[g_m] != OP_READ: mult = ' * 2.0f' if not var[invinds[g_m]] in names: code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(invinds[g_m])+'.size'+mult+';') names = names + [var[invinds[g_m]]] for g_m in range(0,nargs): mult='' if accs[g_m] != OP_WRITE and accs[g_m] != OP_READ: mult = ' * 2.0f' if not var[g_m] in names: names = names + [var[invinds[g_m]]] if maps[g_m] == OP_ID: code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(g_m)+'.size'+mult+';') elif maps[g_m] == OP_GBL: code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(g_m)+'.size'+mult+';') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(invinds[inds[g_m]-1])+'.map->dim * 4.0f;') depth -= 2 code('}') ########################################################################## # output individual kernel file ########################################################################## if not os.path.exists('vec'): os.makedirs('vec') fid = open('vec/'+name+'_veckernel.cpp','w') date = datetime.datetime.now() #fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() # end of main kernel call loop ########################################################################## # output one master kernel file ########################################################################## file_text ='' code('#define double_ALIGN 128') code('#define float_ALIGN 64') code('#define int_ALIGN 64') code('#ifdef VECTORIZE') code('#define SIMD_VEC 4') code('#define ALIGNED_double __attribute__((aligned(double_ALIGN)))') code('#define ALIGNED_float __attribute__((aligned(float_ALIGN)))') code('#define ALIGNED_int __attribute__((aligned(int_ALIGN)))') code(' #ifdef __ICC') code(' #define DECLARE_PTR_ALIGNED(X, Y) __assume_aligned(X, Y)') code(' #else') code(' #define DECLARE_PTR_ALIGNED(X, Y)') code(' #endif') code('#else') code('#define ALIGNED_double') code('#define ALIGNED_float') code('#define ALIGNED_int') code('#define DECLARE_PTR_ALIGNED(X, Y)') code('#endif') code('') comm(' global constants ') for nc in range (0,len(consts)): if not consts[nc]['user_declared']: if consts[nc]['dim']==1: code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';') else: if consts[nc]['dim'].isdigit() and int(consts[nc]['dim']) > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];') code('') comm(' header ') if os.path.exists('./user_types.h'): code('#include "../user_types.h"') code('#include "op_lib_cpp.h"') code('') comm(' user kernel files') for nk in range(0,len(kernels)): code('#include "'+kernels[nk]['name']+'_veckernel.cpp"') master = master.split('.')[0] fid = open('vec/'+master.split('.')[0]+'_veckernels.cpp','w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close()
def op2_gen_openmp4(master, date, consts, kernels): global dims, idxs, typs, indtyps, inddims global FORTRAN, CPP, g_m, file_text, depth OP_ID = 1; OP_GBL = 2; OP_MAP = 3; OP_READ = 1; OP_WRITE = 2; OP_RW = 3; OP_INC = 4; OP_MAX = 5; OP_MIN = 6; accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] op2_compiler = os.getenv('OP2_COMPILER','0'); any_soa = 0 maptype = 'map' for nk in range (0,len(kernels)): any_soa = any_soa or sum(kernels[nk]['soaflags']) ########################################################################## # create new kernel file ########################################################################## for nk in range (0,len(kernels)): name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk]) optidxs = [0]*nargs indopts = [-1]*nargs nopts = 0 for i in range(0,nargs): if optflags[i] == 1 and maps[i] == OP_ID: optidxs[i] = nopts nopts = nopts+1 elif optflags[i] == 1 and maps[i] == OP_MAP: if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination optidxs[i] = nopts indopts[inds[i]-1] = i nopts = nopts+1 else: optidxs[i] = optidxs[invinds[inds[i]-1]] # # set two logicals # j = -1 for i in range(0,nargs): if maps[i] == OP_MAP and accs[i] == OP_INC: j = i ind_inc = j >= 0 j = -1 for i in range(0,nargs): if maps[i] == OP_GBL and accs[i] <> OP_READ: j = i reduct = j >= 0 ########################################################################## # start with the user kernel function ########################################################################## FORTRAN = 0; CPP = 1; g_m = 0; file_text = '' depth = 0 comm('user function') #strides for SoA if any_soa: if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;') code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;') dir_soa = -1 for g_m in range(0,nargs): if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): code('int direct_'+name+'_stride_OP2CONSTANT;') code('int direct_'+name+'_stride_OP2HOST=-1;') dir_soa = g_m break comm('user function') file_name = decl_filepath f = open(file_name, 'r') kernel_text = f.read() f.close() kernel_text = op2_gen_common.comment_remover(kernel_text) kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) p = re.compile('void\\s+\\b'+name+'\\b') i = p.search(kernel_text).start() if(i < 0): print "\n********" print "Error: cannot locate user kernel function name: "+name+" - Aborting code generation" exit(2) i2 = i #i = kernel_text[0:i].rfind('\n') #reverse find j = kernel_text[i:].find('{') k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}') signature_text = kernel_text[i:i+j] l = signature_text[0:].find('(') head_text = signature_text[0:l] #save function name m = op2_gen_common.para_parse(signature_text, 0, '(', ')') signature_text = signature_text[l+1:m] body_text = kernel_text[i+j+1:k] ## Replace occurrences of '#include "<FILE>"' within loop with the contents of <FILE>: body_text = op2_gen_common.replace_local_includes_with_file_contents(body_text, os.path.dirname(master)) # check for number of arguments if len(signature_text.split(',')) != nargs_novec: print 'Error parsing user kernel(%s): must have %d arguments' \ % name, nargs return for i in range(0,nargs_novec): var = signature_text.split(',')[i].strip() if kernels[nk]['soaflags'][i]: var = var.replace('*','') #locate var in body and replace by adding [idx] length = len(re.compile('\\s+\\b').split(var)) var2 = re.compile('\\s+\\b').split(var)[length-1].strip() if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP: body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) else: body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text) body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \ op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) for nc in range(0,len(consts)): varname = consts[nc]['name'] body_text = re.sub('\\b'+varname+'\\b', varname+'_ompkernel',body_text) # if consts[nc]['dim'] == 1: # body_text = re.sub(varname+'(?!\w)', varname+'_ompkernel', body_text) # else: # body_text = re.sub('\*'+varname+'(?!\[)', varname+'[0]', body_text) # body_text = re.sub(r''+varname+'\[([A-Za-z0-9]*)\]'+'', varname+r'_ompkernel[\1]', body_text) vec = 0 for n in range(0,nargs): if (vectorised[n] == 1): vec = 1 kernel_params = [ var.strip() for var in signature_text.split(',')] if vec: new_kernel_params = [] for m in range(0,nargs_novec): if int(kernels[nk]['idxs'][m])<0 and int(kernels[nk]['maps'][m]) == OP_MAP: new_kernel_params = new_kernel_params + [kernel_params[m]]*int(-1*int(kernels[nk]['idxs'][m])) else: new_kernel_params = new_kernel_params + [kernel_params[m]] kernel_params = new_kernel_params # collect constants used by kernel kernel_consts = [] for nc in range(0,len(consts)): if body_text.find(consts[nc]['name']+'_ompkernel') != -1: kernel_consts.append(nc) ############################################################ # omp4 function call definition ############################################################ code('') func_call_signaure_text = 'void ' + name + '_omp4_kernel(' params = '' indent = '\n' + ' ' k = [] for g_m in range(0, nargs): if maps[g_m] == OP_GBL: params += indent + rep('<TYP> *<ARG>,',g_m) if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k): k = k + [invmapinds[inds[g_m]-1]] params += indent + 'int *map'+str(mapinds[g_m])+',' if maptype == 'map': params += indent + 'int map'+str(mapinds[g_m])+'size,' if maps[g_m] == OP_ID: params += indent + rep('<TYP> *data'+str(g_m)+',', g_m) if maptype == 'map': params += indent + 'int dat'+str(g_m)+'size,' for m in range(1,ninds+1): g_m = invinds[m-1] params += indent + rep('<TYP> *data'+str(g_m)+',', g_m) if maptype == 'map': params += indent + 'int dat'+str(g_m)+'size,' if ninds>0: # add indirect kernel specific params to kernel func call params += indent + 'int *col_reord,' + indent + 'int set_size1,' + indent + 'int start,' + indent + 'int end,' else: # add direct kernel specific params to kernel func call params += indent + 'int count,' params += indent + 'int num_teams,' + indent + 'int nthread' #add strides for SoA to params if any_soa: indent = ','+indent if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] params += indent + 'int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT' if dir_soa<>-1: params += indent + 'int direct_'+name+'_stride_OP2CONSTANT' if nopts>0: params += ', int optflags' code(func_call_signaure_text+params+');') ########################################################################## # then C++ stub function ########################################################################## code('') comm(' host stub function') code('void op_par_loop_'+name+'(char const *name, op_set set,') depth += 2 for m in unique_args: g_m = m - 1 if m == unique_args[len(unique_args)-1]: code('op_arg <ARG>){'); code('') else: code('op_arg <ARG>,') for g_m in range (0,nargs): if maps[g_m]==OP_GBL: #and accs[g_m] <> OP_READ: code('<TYP>*<ARG>h = (<TYP> *)<ARG>.data;') code('int nargs = '+str(nargs)+';') code('op_arg args['+str(nargs)+'];') code('') for g_m in range (0,nargs): u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] if len(u) > 0 and vectorised[g_m] > 0: code('<ARG>.idx = 0;') code('args['+str(g_m)+'] = <ARG>;') v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] first = [i for i in range(0,len(v)) if v[i] == 1] first = first[0] if (optflags[g_m] == 1): argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, ' else: argtyp = 'op_arg_dat(' FOR('v','1',str(sum(v))) code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ str(first)+'.map, <DIM>, "<TYP>", '+accsstring[accs[g_m]-1]+');') ENDFOR() code('') elif vectorised[g_m]>0: pass else: code('args['+str(g_m)+'] = <ARG>;') if nopts>0: code('int optflags = 0;') for i in range(0,nargs): if optflags[i] == 1: IF('args['+str(i)+'].opt') code('optflags |= 1<<'+str(optidxs[i])+';') ENDIF() if nopts > 30: print 'ERROR: too many optional arguments to store flags in an integer' # # start timing # code('') comm(' initialise timers') code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') code('op_timing_realloc('+str(nk)+');') code('op_timers_core(&cpu_t1, &wall_t1);') code('OP_kernels[' +str(nk)+ '].name = name;') code('OP_kernels[' +str(nk)+ '].count += 1;') code('') # # indirect bits # if ninds>0: code('int ninds = '+str(ninds)+';') line = 'int inds['+str(nargs)+'] = {' for m in range(0,nargs): line += str(inds[m]-1)+',' code(line[:-1]+'};') code('') IF('OP_diags>2') code('printf(" kernel routine with indirection: '+name+'\\n");') ENDIF() code('') comm(' get plan') code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') # # direct bit # else: code('') IF('OP_diags>2') code('printf(" kernel routine w/o indirection: '+ name + '");') ENDIF() code('') code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') # # get part and block size # code('') code('#ifdef OP_PART_SIZE_'+ str(nk)) code(' int part_size = OP_PART_SIZE_'+str(nk)+';') code('#else') code(' int part_size = OP_part_size;') code('#endif') code('#ifdef OP_BLOCK_SIZE_'+ str(nk)) code(' int nthread = OP_BLOCK_SIZE_'+str(nk)+';') code('#else') code(' int nthread = OP_block_size;') code('#endif') code('') for g_m in range(0,nargs): if maps[g_m]==OP_GBL: #and accs[g_m]<>OP_READ: if not dims[g_m].isdigit() or int(dims[g_m]) > 1: print 'ERROR: OpenMP 4 does not support multi-dimensional variables' exit(-1) code('<TYP> <ARG>_l = <ARG>h[0];') if ninds > 0: code('') code('int ncolors = 0;') code('int set_size1 = set->size + set->exec_size;') code('') IF('set_size >0') #managing constants if any_soa: code('') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))') code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');') code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT = opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST;') ENDIF() if dir_soa<>-1: IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))') code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');') code('direct_'+name+'_stride_OP2CONSTANT = direct_'+name+'_stride_OP2HOST;') ENDIF() code('') comm('Set up typed device pointers for OpenMP') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k): k = k + [invmapinds[inds[g_m]-1]] code('int *map'+str(mapinds[g_m])+' = arg'+str(invmapinds[inds[g_m]-1])+'.map_data_d;') if maptype == 'map': code(' int map'+str(mapinds[g_m])+'size = arg'+str(invmapinds[inds[g_m]-1])+'.map->dim * set_size1;') code('') for g_m in range(0,nargs): if maps[g_m] == OP_ID: code(typs[g_m]+'* data'+str(g_m)+' = ('+typs[g_m]+'*)arg'+str(g_m)+'.data_d;') if maptype == 'map': if optflags[g_m]: code('int dat'+str(g_m)+'size = (arg'+str(g_m)+'.opt?1:0) * getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;') else: code('int dat'+str(g_m)+'size = getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;') for m in range(1,ninds+1): g_m = invinds[m-1] code('<TYP> *data'+str(g_m)+' = (<TYP> *)<ARG>.data_d;') if maptype == 'map': if optflags[g_m]: code('int dat'+str(g_m)+'size = (arg'+str(g_m)+'.opt?1:0) * getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;') else: code('int dat'+str(g_m)+'size = getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;') # # prepare kernel params for indirect version # if ninds>0: code('') code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);') code('ncolors = Plan->ncolors;') code('int *col_reord = Plan->col_reord;') code('') comm(' execute plan') FOR('col','0','Plan->ncolors') IF('col==1') code('op_mpi_wait_all_cuda(nargs, args);') ENDIF() code('int start = Plan->col_offsets[0][col];') code('int end = Plan->col_offsets[0][col+1];') code('') # # kernel function call # indent = '\n' + ' ' * (depth+2) call_params = ','.join([ indent + re.sub(r'\*arg(\d+)',r'&arg\1_l',param.strip().split(' ')[-1]) for param in params.split(',')]) call_params = call_params.replace('*','') # set params for indirect version if ninds>0: call_params = call_params.replace('num_teams','part_size!=0?(end-start-1)/part_size+1:(end-start-1)/nthread') # set params for direct version else: call_params = re.sub('count','set->size',call_params); call_params = call_params.replace('num_teams','part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread') code(func_call_signaure_text.split(' ')[-1]+call_params+');') code('') if ninds>0: if reduct: comm(' combine reduction data') IF('col == Plan->ncolors_owned-1') for g_m in range(0,nargs): if maps[g_m] == OP_GBL and accs[g_m] <> OP_READ: if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE: code('<ARG>h[0] = <ARG>_l;') elif accs[g_m]==OP_MIN: code('<ARG>h[0] = MIN(<ARG>h[0],<ARG>_l);') elif accs[g_m]==OP_MAX: code('<ARG>h[0] = MAX(<ARG>h[0],<ARG>_l);') else: error('internal error: invalid reduction option') ENDIF() ENDFOR() code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') ENDIF() code('') #zero set size issues if ninds>0: IF('set_size == 0 || set_size == set->core_size || ncolors == 1') code('op_mpi_wait_all_cuda(nargs, args);') ENDIF() # # combine reduction data from multiple OpenMP threads # comm(' combine reduction data') for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ: if ninds==0: #direct version only if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE: code('<ARG>h[0] = <ARG>_l;') elif accs[g_m]==OP_MIN: code('<ARG>h[0] = MIN(<ARG>h[0],<ARG>_l);') elif accs[g_m]==OP_MAX: code('<ARG>h[0] = MAX(<ARG>h[0],<ARG>_l);') else: print 'internal error: invalid reduction option' if typs[g_m] == 'double': #need for both direct and indirect code('op_mpi_reduce_double(&<ARG>,<ARG>h);') elif typs[g_m] == 'float': code('op_mpi_reduce_float(&<ARG>,<ARG>h);') elif typs[g_m] == 'int': code('op_mpi_reduce_int(&<ARG>,<ARG>h);') else: print 'Type '+typs[g_m]+' not supported in OpenMP4 code generator, please add it' exit(-1) code('op_mpi_set_dirtybit_cuda(nargs, args);') code('') # # update kernel record # code('if (OP_diags>1) deviceSync();') comm(' update kernel record') code('op_timers_core(&cpu_t2, &wall_t2);') code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') if ninds == 0: line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' for g_m in range (0,nargs): if optflags[g_m]==1: IF('<ARG>.opt') if maps[g_m]<>OP_GBL: if accs[g_m]==OP_READ: code(line+' <ARG>.size;') else: code(line+' <ARG>.size * 2.0f;') if optflags[g_m]==1: ENDIF() depth -= 2 code('}') ########################################################################## # output individual kernel file ########################################################################## if not os.path.exists('openmp4'): os.makedirs('openmp4') fid = open('openmp4/'+name+'_omp4kernel.cpp','w') date = datetime.datetime.now() fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() ############################################################## # generate ****_omp4kernel_func.cpp ############################################################## file_text = '' if CPP: includes = op2_gen_common.extract_includes(kernel_text) if len(includes) > 0: for include in includes: code(include) code("") code(func_call_signaure_text+params+'){') code('') depth += 2 for g_m in range(0, nargs): if maps[g_m] == OP_GBL: code('<TYP> <ARG>_l = *<ARG>;') line = '#pragma omp target teams' if op2_compiler == 'clang': line +=' distribute parallel for schedule(static,1)\\\n' + (depth+2)*' ' line +=' num_teams(num_teams) thread_limit(nthread) ' map_clause = '' if maptype == 'map': map_clause = 'map(to:' elif maptype == 'is_device_ptr': map_clause = 'is_device_ptr(' for g_m in range(0,nargs): if maps[g_m] == OP_ID: if maptype == 'map': map_clause += 'data'+str(g_m)+'[0:dat'+str(g_m)+'size],' else: map_clause += 'data'+str(g_m)+',' if map_clause != 'is_device_ptr(' and map_clause != 'map(to:': map_clause = map_clause[:-1]+')' line += map_clause # mapping global consts if len(kernel_consts) != 0: line += ' \\\n' + (depth+2)*' ' + 'map(to:' for nc in kernel_consts: line += ' ' + consts[nc]['name']+'_ompkernel,' if consts[nc]['dim'] != 1: if consts[nc]['dim'] > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' line = line[:-1] + '[:'+ num +'],' line = line[:-1]+')' # prepare reduction reduction_string = '' reduction_mapping = '' if reduct: reduction_mapping ='\\\n'+(depth+2)*' '+ 'map(tofrom:' for g_m in range(0,nargs): if maps[g_m]==OP_GBL and accs[g_m]<>OP_READ: if accs[g_m] == OP_INC: reduction_string += ' reduction(+:arg%d_l)' % g_m reduction_mapping += ' arg%d_l,' % g_m if accs[g_m] == OP_MIN: reduction_string += ' reduction(min:arg%d_l)' % g_m reduction_mapping += ' arg%d_l,' % g_m if accs[g_m] == OP_MAX: reduction_string += ' reduction(max:arg%d_l)' % g_m reduction_mapping += ' arg%d_l,' % g_m if accs[g_m] == OP_WRITE: reduction_mapping += ' arg%d_l,' % g_m reduction_mapping = reduction_mapping[0:-1]+')' # # map extra pointers for indirect version # if ninds>0: if maptype == 'map': line += '\\\n'+(depth+2)*' '+'map(to:col_reord[0:set_size1],' else: line += '\\\n'+(depth+2)*' '+'map(to:col_reord,' if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k): k = k + [invmapinds[inds[g_m]-1]] if maptype == 'map': line = line + 'map'+str(mapinds[g_m])+'[0:map'+str(mapinds[g_m])+'size],' else: line = line + 'map'+str(mapinds[g_m])+',' for m in range(1,ninds+1): g_m = invinds[m-1] if maptype == 'map': line = line + 'data'+str(g_m)+'[0:dat'+str(g_m)+'size],' else: line = line + 'data'+str(g_m)+',' line = line[:-1]+')' # # write omp pragma # code(line + reduction_mapping + reduction_string) if op2_compiler != 'clang': line = '#pragma omp distribute parallel for schedule(static,1)' code(line + reduction_string) # # start for loop indirect version # if ninds>0: FOR('e','start','end') code('int n_op = col_reord[e];') if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] code('int map'+str(mapinds[g_m])+'idx;') #do non-optional ones if nmaps > 0: k = [] for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k) and (not optflags[g_m]): k = k + [mapinds[g_m]] code('map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\ '[n_op + set_size1 * '+str(idxs[g_m])+'];') #do optional ones if nmaps > 0: for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): if optflags[g_m]: IF('optflags & 1<<'+str(optidxs[g_m])) else: k = k + [mapinds[g_m]] code('map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\ '[n_op + set_size1 * '+str(idxs[g_m])+'];') if optflags[g_m]: ENDIF() code('') for g_m in range (0,nargs): u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] if len(u) > 0 and vectorised[g_m] > 0: if accs[g_m] == OP_READ: line = 'const <TYP>* <ARG>_vec[] = {\n' else: line = '<TYP>* <ARG>_vec[] = {\n' v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] first = [i for i in range(0,len(v)) if v[i] == 1] first = first[0] indent = ' '*(depth+2) for k in range(0,sum(v)): if soaflags[g_m]: line = line + indent + ' &data'+str(first)+'[map'+str(mapinds[g_m+k])+'idx],\n' else: line = line + indent + ' &data'+str(first)+'[<DIM> * map'+str(mapinds[g_m+k])+'idx],\n' line = line[:-2]+'};' code(line) # # direct version # else: FOR('n_op','0','count') # # write inlined kernel function # comm('variable mapping') for g_m in range(0,nargs): line = kernel_params[g_m] + ' = ' if maps[g_m] == OP_ID: if soaflags[g_m]: line += '&data%d[n_op]' % g_m else: line += '&data'+str(g_m)+'['+str(dims[g_m])+'*n_op]' if maps[g_m] == OP_MAP: if vectorised[g_m]: if g_m+1 in unique_args: line += 'arg'+str(g_m)+'_vec' else: line = '' else: if soaflags[g_m]: line += '&data'+str(invinds[inds[g_m]-1])+'[map'+str(mapinds[g_m])+'idx]' else: line += '&data'+str(invinds[inds[g_m]-1])+'['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]' if maps[g_m] == OP_GBL: line += '&arg%d_l' % g_m if len(line): line += ';' code(line) code('') comm('inline function') indent = ' ' * (depth-2) inline_body_text = '' for line in body_text.split('\n'): if len(line): inline_body_text += indent+line+'\n' else: inline_body_text += '\n' code(inline_body_text) comm('end inline func') ENDFOR() code('') # end kernel function for g_m in range(0, nargs): if maps[g_m] == OP_GBL: code('*<ARG> = <ARG>_l;') depth -= 2; code('}') ########################################################################## # output individual omp4kernel file ########################################################################## fid = open('openmp4/'+name+'_omp4kernel_func.cpp','w') date = datetime.datetime.now() fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() # end of main kernel call loop ########################################################################## # output one master kernel file ########################################################################## file_text ='' comm(' header ') code('#include "op_lib_cpp.h" ') code('') comm(' user kernel files') for nk in range(0,len(kernels)): code('#include "'+kernels[nk]['name']+'_omp4kernel.cpp"') master = master.split('.')[0] fid = open('openmp4/'+master.split('.')[0]+'_omp4kernels.cpp','w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close() ########################################################################## # output omp4 master kernel file ########################################################################## file_text ='' comm(' global constants ') for nc in range (0,len(consts)): if consts[nc]['dim']==1: code(consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_ompkernel;') else: if consts[nc]['dim'] > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' code(consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_ompkernel['+num+'];') code('') comm(' header ') if os.path.exists('./user_types.h'): code('#include "../user_types.h"') code('#include "op_lib_cpp.h" ') code('') code('void op_decl_const_char(int dim, char const *type,') code(' int size, char *dat, char const *name){') indent = ' ' * ( 2+ depth) line = ' ' for nc in range (0,len(consts)): varname = consts[nc]['name'] if nc > 0: line += ' else ' line += 'if(!strcmp(name, "%s")) {\n' %varname + indent + 2*' ' + 'memcpy(' if consts[nc]['dim']==1: line += '&' line += varname+ '_ompkernel, dat, dim*size);\n' + indent + '#pragma omp target enter data map(to:'+varname+'_ompkernel' if consts[nc]['dim'] !=1: line += '[:%s]' % str(consts[nc]['dim']) if consts[nc]['dim'] > 0 else 'MAX_CONST_SIZE' line += ')\n'+indent + '}' code(line) code('}') comm(' user kernel files') for nk in range(0,len(kernels)): code('#include "'+kernels[nk]['name']+'_omp4kernel_func.cpp"') master = master.split('.')[0] fid = open('openmp4/'+master.split('.')[0]+'_omp4kernel_funcs.cpp','w') fid.write('//\n// auto-generated by op2.py\n//\n\n') fid.write(file_text) fid.close()