def replace_param(param): param = symbolic.symstr(param) for p, pval in param_to_edge.items(): # TODO: This special replacement condition will be removed # when the code generator is modified to make consistent # scalar/array decisions. if (isinstance(nsdfg.arrays[pval.data.data], data.Scalar) or (nsdfg.arrays[pval.data.data].shape[0] == 1 and len(nsdfg.arrays[pval.data.data].shape) == 1)): param = param.replace(p, pval.data.data) else: param = param.replace(p, cpp_array_expr(nsdfg, pval.data)) return param
def replace_param(param): param = symbolic.symstr(param) for p, pval in param_to_edge.items(): # TODO: Correct w.r.t. connector type param = param.replace(p, cpp_array_expr(nsdfg, pval.data)) return param
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG): node.validate(sdfg, state) input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0] output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0] input_dims = len(input_edge.data.subset) input_data = sdfg.arrays[input_edge.data.data] output_data = sdfg.arrays[output_edge.data.data] # Setup all locations in which code will be written cuda_globalcode = CodeIOStream() localcode = CodeIOStream() # Try to autodetect reduction type redtype = detect_reduction_type(node.wcr) node_id = state.node_id(node) state_id = sdfg.node_id(state) idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name, state=state_id, node=node_id) # Obtain some SDFG-related information input_memlet = input_edge.data output_memlet = output_edge.data if node.out_connectors: dtype = next(node.out_connectors.values()) else: dtype = sdfg.arrays[output_memlet.data].dtype output_type = dtype.ctype if node.identity is None: raise ValueError('For device reduce nodes, initial value must be ' 'specified') # Create a functor or use an existing one for reduction if redtype == dtypes.ReductionType.Custom: body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr) cuda_globalcode.write( """ struct __reduce_{id} {{ template <typename T> DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{ {contents} }} }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg, state_id, node_id) reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity) elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES: reduce_op = '' else: credtype = 'dace::ReductionType::' + str( redtype)[str(redtype).find('.') + 1:] reduce_op = ((', dace::_wcr_fixed<%s, %s>()' % (credtype, output_type)) + ', ' + symstr(node.identity)) # Try to obtain the number of threads in the block, or use the default # configuration block_threads = devicelevel_block_size(sdfg, state, node) if block_threads is not None: block_threads = functools.reduce(lambda a, b: a * b, block_threads, 1) # Checks if block_threads is None: raise ValueError('Block-wide GPU reduction must occur within' ' a GPU kernel') if issymbolic(block_threads, sdfg.constants): raise ValueError('Block size has to be constant for block-wide ' 'reduction (got %s)' % str(block_threads)) if (node.axes is not None and len(node.axes) < input_dims): raise ValueError( 'Only full reduction is supported for block-wide reduce,' ' please use the pure expansion') if (input_data.storage != dtypes.StorageType.Register or output_data.storage != dtypes.StorageType.Register): raise ValueError( 'Block-wise reduction only supports GPU register inputs ' 'and outputs') if redtype in ExpandReduceCUDABlock._SPECIAL_RTYPES: raise ValueError('%s block reduction not supported' % redtype) credtype = 'dace::ReductionType::' + str( redtype)[str(redtype).find('.') + 1:] if redtype == dtypes.ReductionType.Custom: redop = '__reduce_%s()' % idstr else: redop = 'dace::_wcr_fixed<%s, %s>()' % (credtype, output_type) # Allocate shared memory for block reduce localcode.write(""" typedef cub::BlockReduce<{type}, {numthreads}> BlockReduce_{id}; __shared__ typename BlockReduce_{id}::TempStorage temp_storage_{id}; """.format(id=idstr, type=output_data.dtype.ctype, numthreads=block_threads)) input = (input_memlet.data + ' + ' + cpp_array_expr(sdfg, input_memlet, with_brackets=False)) output = cpp_array_expr(sdfg, output_memlet) localcode.write(""" {output} = BlockReduce_{id}(temp_storage_{id}).Reduce({input}, {redop}); """.format(id=idstr, redop=redop, input=input_memlet.data, output=output)) # Make tasklet tnode = dace.nodes.Tasklet('reduce', {'_in': dace.pointer(input_data.dtype)}, {'_out': dace.pointer(output_data.dtype)}, localcode.getvalue(), language=dace.Language.CPP) # Add the rest of the code sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda') # Rename outer connectors and add to node input_edge._dst_conn = '_in' output_edge._src_conn = '_out' node.add_in_connector('_in') node.add_out_connector('_out') return tnode
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG): node.validate(sdfg, state) input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0] output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0] input_dims = len(input_edge.data.subset) output_dims = len(output_edge.data.subset) input_data = sdfg.arrays[input_edge.data.data] output_data = sdfg.arrays[output_edge.data.data] # Setup all locations in which code will be written cuda_globalcode = CodeIOStream() cuda_initcode = CodeIOStream() cuda_exitcode = CodeIOStream() host_globalcode = CodeIOStream() host_localcode = CodeIOStream() output_memlet = output_edge.data # Try to autodetect reduction type redtype = detect_reduction_type(node.wcr) node_id = state.node_id(node) state_id = sdfg.node_id(state) idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name, state=state_id, node=node_id) if node.out_connectors: dtype = next(node.out_connectors.values()) else: dtype = sdfg.arrays[output_memlet.data].dtype output_type = dtype.ctype if node.identity is None: raise ValueError('For device reduce nodes, initial value must be ' 'specified') # Create a functor or use an existing one for reduction if redtype == dtypes.ReductionType.Custom: body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr) cuda_globalcode.write( """ struct __reduce_{id} {{ template <typename T> DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{ {contents} }} }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg, state_id, node_id) reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity) elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES: reduce_op = '' else: credtype = 'dace::ReductionType::' + str( redtype)[str(redtype).find('.') + 1:] reduce_op = ((', dace::_wcr_fixed<%s, %s>()' % (credtype, output_type)) + ', ' + symstr(node.identity)) # Obtain some SDFG-related information input_memlet = input_edge.data reduce_shape = input_memlet.subset.bounding_box_size() num_items = ' * '.join(symstr(s) for s in reduce_shape) input = (input_memlet.data + ' + ' + cpp_array_expr(sdfg, input_memlet, with_brackets=False)) output = (output_memlet.data + ' + ' + cpp_array_expr(sdfg, output_memlet, with_brackets=False)) input_dims = input_memlet.subset.dims() output_dims = output_memlet.subset.data_dims() reduce_all_axes = (node.axes is None or len(node.axes) == input_dims) if reduce_all_axes: reduce_last_axes = False else: reduce_last_axes = sorted(node.axes) == list( range(input_dims - len(node.axes), input_dims)) if (not reduce_all_axes) and (not reduce_last_axes): raise NotImplementedError( 'Multiple axis reductions not supported on GPUs. Please use ' 'the pure expansion or make reduce axes the last in the array.' ) # Verify that data is on the GPU if input_data.storage not in [ dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned ]: raise ValueError('Input of GPU reduction must either reside ' ' in global GPU memory or pinned CPU memory') if output_data.storage not in [ dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned ]: raise ValueError('Output of GPU reduction must either reside ' ' in global GPU memory or pinned CPU memory') # Determine reduction type kname = (ExpandReduceCUDADevice._SPECIAL_RTYPES[redtype] if redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES else 'Reduce') # Create temp memory for this GPU cuda_globalcode.write( """ void *__cub_storage_{sdfg}_{state}_{node} = NULL; size_t __cub_ssize_{sdfg}_{state}_{node} = 0; """.format(sdfg=sdfg.name, state=state_id, node=node_id), sdfg, state_id, node) if reduce_all_axes: reduce_type = 'DeviceReduce' reduce_range = num_items reduce_range_def = 'size_t num_items' reduce_range_use = 'num_items' reduce_range_call = num_items elif reduce_last_axes: num_reduce_axes = len(node.axes) not_reduce_axes = reduce_shape[:-num_reduce_axes] reduce_axes = reduce_shape[-num_reduce_axes:] num_segments = ' * '.join([symstr(s) for s in not_reduce_axes]) segment_size = ' * '.join([symstr(s) for s in reduce_axes]) reduce_type = 'DeviceSegmentedReduce' iterator = 'dace::stridedIterator({size})'.format( size=segment_size) reduce_range = '{num}, {it}, {it} + 1'.format(num=num_segments, it=iterator) reduce_range_def = 'size_t num_segments, size_t segment_size' iterator_use = 'dace::stridedIterator(segment_size)' reduce_range_use = 'num_segments, {it}, {it} + 1'.format( it=iterator_use) reduce_range_call = '%s, %s' % (num_segments, segment_size) # Call CUB to get the storage size, allocate and free it cuda_initcode.write( """ cub::{reduce_type}::{kname}(nullptr, __cub_ssize_{sdfg}_{state}_{node}, ({intype}*)nullptr, ({outtype}*)nullptr, {reduce_range}{redop}); cudaMalloc(&__cub_storage_{sdfg}_{state}_{node}, __cub_ssize_{sdfg}_{state}_{node}); """.format(sdfg=sdfg.name, state=state_id, node=node_id, reduce_type=reduce_type, reduce_range=reduce_range, redop=reduce_op, intype=input_data.dtype.ctype, outtype=output_data.dtype.ctype, kname=kname), sdfg, state_id, node) cuda_exitcode.write( 'cudaFree(__cub_storage_{sdfg}_{state}_{node});'.format( sdfg=sdfg.name, state=state_id, node=node_id), sdfg, state_id, node) # Write reduction function definition cuda_globalcode.write(""" DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream); void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream) {{ cub::{reduce_type}::{kname}(__cub_storage_{id}, __cub_ssize_{id}, input, output, {reduce_range_use}{redop}, stream); }} """.format(id=idstr, intype=input_data.dtype.ctype, outtype=output_data.dtype.ctype, reduce_type=reduce_type, reduce_range_def=reduce_range_def, reduce_range_use=reduce_range_use, kname=kname, redop=reduce_op)) # Write reduction function definition in caller file host_globalcode.write( """ DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream); """.format(id=idstr, reduce_range_def=reduce_range_def, intype=input_data.dtype.ctype, outtype=output_data.dtype.ctype), sdfg, state_id, node) # Call reduction function where necessary host_localcode.write( '__dace_reduce_{id}({input}, {output}, {reduce_range_call}, __dace_current_stream);' .format(id=idstr, input=input, output=output, reduce_range_call=reduce_range_call)) # Make tasklet tnode = dace.nodes.Tasklet('reduce', {'_in': dace.pointer(input_data.dtype)}, {'_out': dace.pointer(output_data.dtype)}, host_localcode.getvalue(), language=dace.Language.CPP) # Add the rest of the code sdfg.append_global_code(host_globalcode.getvalue()) sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda') sdfg.append_init_code(cuda_initcode.getvalue(), 'cuda') sdfg.append_exit_code(cuda_exitcode.getvalue(), 'cuda') # Rename outer connectors and add to node input_edge._dst_conn = '_in' output_edge._src_conn = '_out' node.add_in_connector('_in') node.add_out_connector('_out') return tnode
def copy_memory(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, src_node: nodes.Node, dst_node: nodes.Node, edge: MultiConnectorEdge, function_stream: CodeIOStream, callsite_stream: CodeIOStream): # Obtain source and destination information, handle access<->tasklet # If copying from tensor core fragments to/from tasklets, we only need # to emit a reference, as the fragment contains the memory. src_desc = (src_node.desc(sdfg) if isinstance(src_node, nodes.AccessNode) else None) # Tasklet -> Array if not src_desc: local_name = dfg.memlet_path(edge)[0].src_conn callsite_stream.write( 'auto& %s = %s;' % (local_name, dst_node.data), sdfg, state_id, [src_node, dst_node]) return dst_desc = (dst_node.desc(sdfg) if isinstance(dst_node, nodes.AccessNode) else None) # Array -> Tasklet if not dst_desc: local_name = dfg.memlet_path(edge)[-1].dst_conn callsite_stream.write( 'auto& %s = %s;' % (local_name, src_node.data), sdfg, state_id, [src_node, dst_node]) return nontc_desc = (dst_desc if 'TensorCore' in src_desc.storage.name else src_desc) nontc_node = (dst_node if 'TensorCore' in src_desc.storage.name else src_node) # Majority is detected by the strides of the data row_major = True if nontc_desc.strides[-1] == 1 else False ##################################################################### # Set non-tensor-core C++ expression based on memlet if edge.data.data == nontc_node.data: other_expr = cpp_array_expr(sdfg, edge.data) elif edge.data.other_subset is not None: offset_cppstr = cpp_offset_expr(nontc_desc, edge.data.other_subset) other_expr = '%s[%s]' % (nontc_node.data, offset_cppstr) else: other_expr = '%s[0]' % nontc_node.data ##################################################################### # Emit copy code if 'TensorCore' in dst_desc.storage.name: # GPU memory to Tensor Cores callsite_stream.write( 'wmma::load_matrix_sync({tc}, &{other}, ' '{stride});'.format( tc=dst_node.data, other=other_expr, stride=src_desc.strides[0 if row_major else 1]), sdfg, state_id, [src_node, dst_node]) else: # Tensor Cores to GPU memory callsite_stream.write( 'wmma::store_matrix_sync(&{other}, {tc}, ' '{stride}, wmma::mem_{maj}_major);'.format( tc=src_node.data, other=other_expr, maj='row' if row_major else 'col', stride=dst_desc.strides[0 if row_major else 1]), sdfg, state_id, [src_node, dst_node])
def memlet_view_ctor(self, sdfg, memlet, direction): useskip = False memlet_params = [] memlet_name = memlet.data if isinstance(sdfg.arrays[memlet.data], data.Scalar): raise ValueError("This should never have happened") if isinstance(memlet.subset, subsets.Indices): # Compute address memlet_params.append(cpp_array_expr(sdfg, memlet, False)) dims = 0 elif isinstance(memlet.subset, subsets.Range): dims = len(memlet.subset.ranges) #memlet_params.append("") # Dimensions to remove from view (due to having one value) indexdims = [] nonIndexDims = [] for dim, (rb, re, rs) in enumerate(memlet.subset.ranges): if rs != 1: useskip = True try: if (re - rb) == 0: indexdims.append(dim) else: nonIndexDims.append(dim) except TypeError: # cannot determine truth value of Relational nonIndexDims.append(dim) if len(nonIndexDims) > 1 and len(indexdims) > 0: raise NotImplementedError( 'subviews of more than one dimension ' + 'not implemented') elif len( nonIndexDims) == 1 and len(indexdims) > 0: # One dimension indexdim = nonIndexDims[0] # Contiguous dimension if indexdim == dims - 1: memlet_params[-1] += ' + %s' % cpp_array_expr( sdfg, memlet, False) memlet_params.append( '0, %s' % (sym2cpp(memlet.subset.ranges[-1][1] - memlet.subset.ranges[-1][0]))) else: # Non-contiguous dimension useskip = True memlet_params[-1] += ' + %s' % cpp_array_expr( sdfg, memlet, False) memlet_range = memlet.subset.ranges[indexdim] memlet_stride = sdfg.arrays[memlet.data].strides[indexdim] memlet_stride = sym2cpp(memlet_stride) memlet_params.append( '0, %s, %s' % (sym2cpp(memlet_range[1] - memlet_range[0]), sym2cpp(memlet_stride))) # Subtract index dimensions from array dimensions dims -= len(indexdims) elif len(indexdims) == 0: for (rb, re, rs), s in zip(memlet.subset.ranges, sdfg.arrays[memlet.data].shape): if useskip: memlet_params.append( '%s, %s, %s' % (cppunparse.pyexpr2cpp(symbolic.symstr(rb)), cppunparse.pyexpr2cpp(symbolic.symstr(s)), cppunparse.pyexpr2cpp(symbolic.symstr(rs)))) else: memlet_params.append( '%s, %s' % (cppunparse.pyexpr2cpp(symbolic.symstr(rb)), cppunparse.pyexpr2cpp(symbolic.symstr(s)))) elif len(nonIndexDims) == 0: # Scalar view if len(memlet_params) > 0: # Compute address memlet_params[-1] += ' + ' + cpp_array_expr( sdfg, memlet, False) else: memlet_params.append(cpp_array_expr(sdfg, memlet, False)) dims = 0 else: raise RuntimeError('Memlet type "%s" not implemented' % memlet.subset) if dims == 0: return 'dace::ArrayViewImmaterial%s%s<%s, %s, int32_t> ("%s", %s)' % ( 'In' if direction == "in" else "Out", 'Skip' if useskip else '', sdfg.arrays[memlet.data].dtype.ctype, symbolic.symstr( memlet.veclen), memlet.data, ', '.join(memlet_params)) else: return 'dace::ArrayViewImmaterial%s%s<%s, %s, int32_t, %s> ("%s", %s)' % ( 'In' if direction == "in" else "Out", 'Skip' if useskip else '', sdfg.arrays[memlet.data].dtype.ctype, symbolic.symstr(memlet.veclen), ', '.join([ str(s) for s in memlet.subset.bounding_box_size() ]), memlet.data, ', '.join(memlet_params))