def _make_fetchupdate(self, iet, sync_ops, pieces, *args):
    """
    Lower the given `iet` into a static Callable that performs the initial
    host-to-device data transfers, and return a Call to it (wrapped in a
    List with an explanatory header comment).

    The generated Callable is appended to `pieces.funcs`; its parameters are
    derived from both the transferred Functions and the free symbols of the
    body.
    """
    # One device-update transfer per sync operation
    transfers = []
    for s in sync_ops:
        # The guarding condition is already encoded in `iet` with a
        # Conditional, which stems from the originating Cluster's guards,
        # so no fetch-condition is expected here
        assert s.fcond is None

        mask = [(s.tstore, s.size) if d.root is s.dim.root else FULL
                for d in s.dimensions]
        transfers.append(
            PragmaTransfer(self.lang._map_update_device, s.target, imask=mask)
        )

    # Package the init IET, plus the transfers, as a standalone Callable
    funcname = self.sregistry.make_name(prefix='init_device')
    callable_body = List(body=iet.body + tuple(transfers))
    touched = filter_ordered(
        flatten([(s.target, s.function) for s in sync_ops])
    )
    callable_params = filter_sorted(touched + derive_parameters(callable_body))
    pieces.funcs.append(
        Callable(funcname, callable_body, 'void', callable_params, 'static')
    )

    # The main thread performs the initial fetch via a plain Call
    return List(header=c.Comment("Initialize data stream"),
                body=Call(funcname, callable_params))
def _make_withlock(self, iet, sync_ops, pieces, root):
    """
    Lower the given `iet` into a ThreadFunction performing an asynchronous
    device-to-host copy, protected by the locks carried by `sync_ops`.

    The pre-actions issue the async host updates (plus a wait, if the
    language backend requires one) and set each lock handle to 1; the
    post-actions set it to 2. The computed value of those flags is a
    protocol between producer and consumer threads — see the lock
    machinery elsewhere in the codebase for the exact semantics.

    Side effects on `pieces`: the generated functions are appended to
    `pieces.funcs`, thread start-up to `pieces.init`, the final wait to
    `pieces.finalize`, and the created objects are registered in
    `pieces.objs`. Returns the thread-activation IET that replaces `iet`.
    """
    # Sorting for deterministic code gen
    locks = sorted({s.lock for s in sync_ops}, key=lambda i: i.name)

    # The `min` is used to pick the maximum possible degree of parallelism.
    # For example, assume there are two locks in the given `sync_ops`, `lock0(i)`
    # and `lock1(j)`. If, say, `lock0` protects 3 entries of a certain Function
    # `u`, while `lock1` protects 2 entries of the Function `v`, then there
    # will never be more than 2 threads in flight concurrently
    npthreads = min(i.size for i in locks)

    preactions = [BlankLine]
    for s in sync_ops:
        # Only the locked dimensions are narrowed to the handle's indices;
        # all other dimensions are transferred in full
        imask = [s.handle.indices[d] if d.root in s.lock.locked_dimensions else FULL
                 for d in s.target.dimensions]
        update = PragmaTransfer(self.lang._map_update_host_async, s.target,
                                imask=imask, queueid=SharedData._field_id)
        preactions.append(update)
    # Some language backends have no wait construct (`_map_wait` -> None)
    wait = self.lang._map_wait(SharedData._field_id)
    if wait is not None:
        preactions.append(Pragma(wait))
    preactions.extend([DummyExpr(s.handle, 1) for s in sync_ops])
    preactions.append(BlankLine)

    postactions = [BlankLine]
    postactions.extend([DummyExpr(s.handle, 2) for s in sync_ops])

    # Turn `iet` into a ThreadFunction so that it can be executed
    # asynchronously by a pthread in the `npthreads` pool
    name = self.sregistry.make_name(prefix='copy_device_to_host')
    body = List(body=tuple(preactions) + iet.body + tuple(postactions))
    tctx = make_thread_ctx(name, body, root, npthreads, sync_ops, self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Schedule computation to the first available thread
    iet = tctx.activate

    # Fire up the threads
    pieces.init.append(tctx.init)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    # Keep track of created objects
    pieces.objs.add(sync_ops, tctx.sdata, tctx.threads)

    return iet
def _make_delete(self, iet, sync_ops, *args):
    """
    Append, after the given `iet`, the device-side deletion clauses for
    the Functions referenced by `sync_ops`, and return the combined IET.
    """
    # One device-deletion transfer per sync operation; the fetched slot of
    # the sync dimension is deleted, every other dimension in full
    deletions = [
        PragmaTransfer(
            self.lang._map_delete, s.function,
            imask=[(s.fetch, s.size) if d.root is s.dim.root else FULL
                   for d in s.dimensions]
        )
        for s in sync_ops
    ]

    # Glue the deletion clauses after the original IET
    return List(header=c.Line(), body=[iet, BlankLine] + deletions)
def _make_prefetchupdate(self, iet, sync_ops, pieces, root):
    """
    Lower the given prefetch `iet` into a ThreadFunction that, after
    executing the original body, issues asynchronous host-to-device
    updates (plus a wait, if the language backend requires one).

    Side effects on `pieces`: generated functions go to `pieces.funcs`,
    thread start-up to `pieces.init`, the final wait to `pieces.finalize`,
    and the created objects are registered in `pieces.objs`. Returns the
    thread-activation IET that replaces `iet`.
    """
    queue = SharedData._field_id

    # The epilogue performs the async device updates, one per sync op
    epilogue = [BlankLine]
    for s in sync_ops:
        # `pcond` is not None, but we won't use it here because the condition
        # is actually already encoded in `iet` itself (it stems from the
        # originating Cluster's guards)
        assert s.pcond is not None

        mask = [(s.tstore, s.size) if d.root is s.dim.root else FULL
                for d in s.dimensions]
        epilogue.append(
            PragmaTransfer(self.lang._map_update_device_async, s.target,
                           imask=mask, queueid=queue)
        )
    # Some language backends have no wait construct (`_map_wait` -> None)
    barrier = self.lang._map_wait(queue)
    if barrier is not None:
        epilogue.append(Pragma(barrier))

    # Package body + epilogue as a ThreadFunction
    funcname = self.sregistry.make_name(prefix='prefetch_host_to_device')
    tctx = make_thread_ctx(funcname,
                           List(body=iet.body + tuple(epilogue)),
                           root, None, sync_ops, self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Fire up the threads
    pieces.init.append(tctx.init)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    # Keep track of created objects
    pieces.objs.add(sync_ops, tctx.sdata, tctx.threads)

    # The IET degenerates to the threads activation logic
    return tctx.activate
def _make_fetchprefetch(self, iet, sync_ops, pieces, root):
    """
    Lower the given `iet` so that data is streamed to the device: an
    initial synchronous fetch performed by the main thread, plus an
    asynchronous per-iteration prefetch performed by a ThreadFunction.

    Three groups of transfers are built from `sync_ops`:
    * fetches: conditional (`fcond`) initial host-to-device copies;
    * presents: `present` clauses asserting the data is device-resident
      when the main computation runs;
    * prefetches: conditional (`pcond`) asynchronous host-to-device
      copies on queue `SharedData._field_id`.

    Side effects on `pieces`: generated Callables/ThreadFunctions go to
    `pieces.funcs`, initialization to `pieces.init`, the final wait to
    `pieces.finalize`, and created objects to `pieces.objs`. Returns the
    glued IET (busy-wait on the prefetch thread, present clauses, the
    original `iet`, then the next prefetch activation).
    """
    fid = SharedData._field_id

    fetches = []
    prefetches = []
    presents = []
    for s in sync_ops:
        f = s.function
        dimensions = s.dimensions
        fc = s.fetch
        ifc = s.ifetch
        pfc = s.pfetch
        fcond = s.fcond
        pcond = s.pcond

        # Construct init IET
        imask = [(ifc, s.size) if d.root is s.dim.root else FULL for d in dimensions]
        fetch = PragmaTransfer(self.lang._map_to, f, imask=imask)
        fetches.append(Conditional(fcond, fetch))

        # Construct present clauses
        imask = [(fc, s.size) if d.root is s.dim.root else FULL for d in dimensions]
        presents.append(PragmaTransfer(self.lang._map_present, f, imask=imask))

        # Construct prefetch IET
        imask = [(pfc, s.size) if d.root is s.dim.root else FULL for d in dimensions]
        prefetch = PragmaTransfer(self.lang._map_to_wait, f, imask=imask, queueid=fid)
        prefetches.append(Conditional(pcond, prefetch))

    # Turn init IET into a Callable
    functions = filter_ordered(s.function for s in sync_ops)
    name = self.sregistry.make_name(prefix='init_device')
    body = List(body=fetches)
    parameters = filter_sorted(functions + derive_parameters(body))
    func = Callable(name, body, 'void', parameters, 'static')
    pieces.funcs.append(func)

    # Perform initial fetch by the main thread
    pieces.init.append(List(header=c.Comment("Initialize data stream"),
                            body=[Call(name, parameters), BlankLine]))

    # Turn prefetch IET into a ThreadFunction
    name = self.sregistry.make_name(prefix='prefetch_host_to_device')
    body = List(header=c.Line(), body=prefetches)
    tctx = make_thread_ctx(name, body, root, None, sync_ops, self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Glue together all the IET pieces, including the activation logic.
    # The BusyWait spins until the prefetch thread's flag equals 1, i.e.
    # until the previously scheduled prefetch has completed — presumably
    # the flag protocol set up by `make_thread_ctx`; verify there
    sdata = tctx.sdata
    threads = tctx.threads
    iet = List(body=[BlankLine,
                     BusyWait(CondNe(FieldFromComposite(sdata._field_flag,
                                                        sdata[threads.index]), 1))] +
               presents + [iet, tctx.activate])

    # Fire up the threads
    pieces.init.append(tctx.init)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    # Keep track of created objects
    pieces.objs.add(sync_ops, sdata, threads)

    return iet