def _(iet):
    # Special symbol which gives user code control over data deallocations
    devicerm = DeviceRM()

    # Collect written and read-only symbols
    writes = set()
    reads = set()
    for i, v in MapExprStmts().visit(iet).items():
        if not i.is_Expression:
            # No-op
            continue
        if not any(isinstance(j, self.lang.DeviceIteration) for j in v):
            # Not an offloaded Iteration tree
            continue
        if i.write.is_DiscreteFunction:
            writes.add(i.write)
        reads.update({r for r in i.reads if r.is_DiscreteFunction})

    # Populate `storage`
    storage = Storage()
    for i in filter_sorted(writes):
        if is_on_device(i, self.gpu_fit):
            self._map_function_on_high_bw_mem(iet, i, storage, devicerm)
    for i in filter_sorted(reads - writes):
        if is_on_device(i, self.gpu_fit):
            self._map_function_on_high_bw_mem(iet, i, storage, devicerm, True)

    iet = self._dump_storage(iet, storage)

    return iet, {'args': devicerm}
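
# Illustrative sketch (not Devito code): the read/write partitioning above in
# miniature. `Expr` and `exprs` are hypothetical stand-ins for the expression
# statements that `MapExprStmts` would visit.
from collections import namedtuple

Expr = namedtuple('Expr', ['write', 'reads'])  # one written symbol, many read symbols

exprs = [Expr(write='u', reads={'u', 'm'}),
         Expr(write='v', reads={'u', 'v'})]

writes, reads = set(), set()
for e in exprs:
    writes.add(e.write)
    reads.update(e.reads)

# Read-only symbols are exactly those mapped with `read_only=True` above:
# they never need to be copied back from device to host
read_only = reads - writes
print(sorted(writes), sorted(read_only))  # ['u', 'v'] ['m']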
def _is_offloadable(self, iet):
    """
    True if the Iteration tree `iet` can be offloaded to the device, that is
    if it only writes to device Functions and does not mix mapped Array
    buffers with host Functions.
    """
    expressions = FindNodes(Expression).visit(iet)
    if any(not is_on_device(e.write, self.gpu_fit) for e in expressions):
        return False

    functions = FindSymbols().visit(iet)
    buffers = [f for f in functions if f.is_Array and f._mem_mapped]
    hostfuncs = [f for f in functions if not is_on_device(f, self.gpu_fit)]

    return not (buffers and hostfuncs)
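
# Illustrative check (not Devito code) of the veto condition above: offloading
# is rejected only when mapped Array buffers and host Functions coexist.
for buffers, hostfuncs in [([], []), (['buf'], []), ([], ['f']), (['buf'], ['f'])]:
    print(bool(buffers), bool(hostfuncs), '->', not (buffers and hostfuncs))
# False False -> True
# True False -> True
# False True -> True
# True True -> False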
def _make_partree(self, candidates, nthreads=None):
    """
    Parallelize the `candidates` Iterations. In particular:

        * All parallel Iterations not *writing* to a host Function, that is
          a Function `f` such that `is_on_device(f) == False`, are offloaded
          to the device.
        * The remaining ones, that is those writing to a host Function,
          are parallelized on the host.
    """
    assert candidates
    root = candidates[0]

    if is_on_device(root, self.gpu_fit, only_writes=True):
        # The typical case: all written Functions are device Functions, that is
        # they're mapped in the device memory. Then we offload `root` to the device

        # Get the collapsable Iterations
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = 1 + len(collapsable)

        body = self.DeviceIteration(gpu_fit=self.gpu_fit, ncollapse=ncollapse,
                                    **root.args)
        partree = ParallelTree([], body, nthreads=nthreads)

        return root, partree
    elif not self.par_disabled:
        # Resort to host parallelism
        return super()._make_partree(candidates, nthreads)
    else:
        return root, None
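
# Illustrative sketch (not Devito code): _make_partree reduces to a three-way
# dispatch. `writes_to_host_function` stands in for the negated
# `is_on_device(root, ..., only_writes=True)` test; the name is made up.
def dispatch(writes_to_host_function, par_disabled):
    if not writes_to_host_function:
        return 'device'       # offload as a DeviceIteration
    elif not par_disabled:
        return 'host'         # resort to host parallelism
    else:
        return 'sequential'   # leave the Iteration untouched

assert dispatch(False, True) == 'device'    # device Functions only -> offload
assert dispatch(True, False) == 'host'      # host write, host parallelism enabled
assert dispatch(True, True) == 'sequential'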
def _make_clauses(cls, ncollapse=None, reduction=None, tile=None, **kwargs):
    clauses = []

    if ncollapse:
        clauses.append('collapse(%d)' % ncollapse)
    elif tile:
        clauses.append('tile(%s)' % ','.join(str(i) for i in tile))

    if reduction:
        clauses.append(make_clause_reduction(reduction))

    indexeds = FindSymbols('indexeds').visit(kwargs['nodes'])
    deviceptrs = filter_ordered(i.name for i in indexeds if i.function._mem_local)
    presents = filter_ordered(i.name for i in indexeds
                              if (is_on_device(i, kwargs['gpu_fit']) and
                                  i.name not in deviceptrs))

    # The NVC 20.7 and 20.9 compilers have a bug which triggers data movement for
    # indirectly indexed arrays (e.g., a[b[i]]) unless a present clause is used
    if presents:
        clauses.append("present(%s)" % ",".join(presents))

    if deviceptrs:
        clauses.append("deviceptr(%s)" % ",".join(deviceptrs))

    return clauses
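
# For concreteness (illustrative only): the clauses computed above are later
# attached to a device pragma by the code generator. The `present`/`deviceptr`/
# `tile` vocabulary is OpenACC's, so a plausible rendering is:
clauses = ['collapse(3)', 'present(u,v)', 'deviceptr(tmp)']
print('#pragma acc parallel loop ' + ' '.join(clauses))
# #pragma acc parallel loop collapse(3) present(u,v) deviceptr(tmp)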
def _make_clauses(cls, ncollapse=None, reduction=None, **kwargs):
    clauses = []

    clauses.append('collapse(%d)' % (ncollapse or 1))

    if reduction:
        clauses.append(make_clause_reduction(reduction))

    symbols = FindSymbols().visit(kwargs['nodes'])
    deviceptrs = [i.name for i in symbols if i.is_Array and i._mem_default]
    presents = [i.name for i in symbols
                if (i.is_AbstractFunction and
                    is_on_device(i, kwargs['gpu_fit']) and
                    i.name not in deviceptrs)]

    # The NVC 20.7 and 20.9 compilers have a bug which triggers data movement for
    # indirectly indexed arrays (e.g., a[b[i]]) unless a present clause is used
    if presents:
        clauses.append("present(%s)" % ",".join(presents))

    if deviceptrs:
        clauses.append("deviceptr(%s)" % ",".join(deviceptrs))

    return clauses
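
# Behavioural difference from the variant above (illustrative): here `collapse`
# is always emitted, defaulting to depth 1, whereas above it is emitted only on
# request, with `tile` as the alternative.
for ncollapse in (None, 4):
    print('collapse(%d)' % (ncollapse or 1))
# collapse(1)
# collapse(4)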
def needs_transfer(f):
    # True if `f` is an AbstractFunction mapped into device memory, hence
    # requiring host<->device data transfers
    return (isinstance(f, AbstractFunction) and
            is_on_device(f, self.gpu_fit) and
            f._mem_mapped)
def needs_transfer(f):
    # Same intent as above, but enumerating the concrete types whose data
    # lives on the device
    return (is_on_device(f, self.gpu_fit) and
            isinstance(f, (Array, Function, AbstractSparseFunction)))
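
# Toy contrast of the two predicates (all classes here are made up): the first
# keys off the `_mem_mapped` property, the second enumerates concrete types.
# Property-based filtering picks up new subclasses with no type list to keep
# in sync.
class AbstractFunction:
    _mem_mapped = True

class DeviceLocalArray(AbstractFunction):
    _mem_mapped = False  # e.g., scratch storage allocated directly on the device

def needs_transfer(f):
    return isinstance(f, AbstractFunction) and f._mem_mapped

print(needs_transfer(AbstractFunction()))  # True
print(needs_transfer(DeviceLocalArray()))  # False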