Exemplo n.º 1
0
        def _(iet):
            """
            Process the DiscreteFunctions used by offloaded Expressions in `iet`.

            The Expressions found by `MapExprStmts` are scanned; those associated
            with at least one `self.lang.DeviceIteration` contribute their written
            and read DiscreteFunctions. Each collected function accepted by
            `is_on_device` is handed to `self._map_function_on_high_bw_mem`, and
            the resulting `storage` is applied to `iet` via `self._dump_storage`.

            Returns the updated `iet` together with a dict carrying the `DeviceRM`
            object under the 'args' key.
            """
            # Special symbol which gives user code control over data deallocations
            devicerm = DeviceRM()

            # Gather the written and the read DiscreteFunctions
            written, accessed = set(), set()
            for expr, nodes in MapExprStmts().visit(iet).items():
                if not expr.is_Expression:
                    # Nothing to collect
                    continue

                # NOTE(review): `nodes` presumably holds the nodes enclosing
                # `expr` -- confirm against MapExprStmts
                offloaded = any(isinstance(n, self.lang.DeviceIteration)
                                for n in nodes)
                if not offloaded:
                    # Not part of an offloaded Iteration tree
                    continue

                target = expr.write
                if target.is_DiscreteFunction:
                    written.add(target)
                accessed.update(r for r in expr.reads if r.is_DiscreteFunction)

            # Populate `storage`: written functions first ...
            storage = Storage()
            for f in filter_sorted(written):
                if not is_on_device(f, self.gpu_fit):
                    continue
                self._map_function_on_high_bw_mem(iet, f, storage, devicerm)

            # ... then the functions that are read but never written. The extra
            # positional `True` is presumably a read-only flag -- confirm against
            # `_map_function_on_high_bw_mem`
            never_written = accessed - written
            for f in filter_sorted(never_written):
                if not is_on_device(f, self.gpu_fit):
                    continue
                self._map_function_on_high_bw_mem(iet, f, storage, devicerm, True)

            return self._dump_storage(iet, storage), {'args': devicerm}
Exemplo n.º 2
0
    def _is_offloadable(self, iet):
        """
        Return True if `iet` can be offloaded, False otherwise.

        `iet` is rejected when any of its Expressions writes to an object for
        which `is_on_device` is false, or when it contains both a memory-mapped
        Array (`f.is_Array and f._mem_mapped`) and a symbol for which
        `is_on_device` is false.
        """
        # Every Expression must write to something living on the device
        exprs = FindNodes(Expression).visit(iet)
        if not all(is_on_device(e.write, self.gpu_fit) for e in exprs):
            return False

        symbols = FindSymbols().visit(iet)
        mapped_arrays = [s for s in symbols if s.is_Array and s._mem_mapped]
        on_host = [s for s in symbols if not is_on_device(s, self.gpu_fit)]

        # Mapped Arrays and host symbols must not coexist
        return not mapped_arrays or not on_host
Exemplo n.º 3
0
    def _make_partree(self, candidates, nthreads=None):
        """
        Parallelize the `candidates` Iterations, rooted at `candidates[0]`.

        Three outcomes are possible:

            * If `is_on_device(root, self.gpu_fit, only_writes=True)` holds, the
              root is turned into a `self.DeviceIteration` (collapsing the
              Iterations returned by `self._find_collapsable`) and wrapped into
              a ParallelTree.
            * Otherwise, unless `self.par_disabled` is set, the work is
              delegated to the parent class' `_make_partree` (host parallelism).
            * Otherwise nothing is parallelized and `(root, None)` is returned.

        Returns a 2-tuple whose first entry is the root Iteration (or whatever
        the parent class returns, in the second case).
        """
        assert candidates
        root = candidates[0]

        if not is_on_device(root, self.gpu_fit, only_writes=True):
            if self.par_disabled:
                # Neither offloading nor host parallelism
                return root, None
            # Resort to host parallelism
            return super()._make_partree(candidates, nthreads)

        # The typical case: all written Functions are device Functions, that is
        # they're mapped in the device memory, so `root` gets offloaded

        # The root itself plus all of the collapsable Iterations
        ncollapse = 1 + len(self._find_collapsable(root, candidates))

        device_loop = self.DeviceIteration(gpu_fit=self.gpu_fit,
                                           ncollapse=ncollapse, **root.args)

        return root, ParallelTree([], device_loop, nthreads=nthreads)
Exemplo n.º 4
0
    def _make_clauses(cls, ncollapse=None, reduction=None, tile=None, **kwargs):
        """
        Build the list of clause strings for the given parameters.

        Parameters
        ----------
        ncollapse : optional
            If truthy, a `collapse(<ncollapse>)` clause is emitted.
        reduction : optional
            If truthy, the clause built by `make_clause_reduction` is emitted.
        tile : iterable, optional
            Only used when `ncollapse` is falsy; if truthy, a `tile(...)` clause
            listing its comma-separated entries is emitted.
        **kwargs
            Must provide 'nodes' (visited to find the Indexeds) and, if any
            Indexed is found, 'gpu_fit' (forwarded to `is_on_device`).

        Returns
        -------
        list of str
            Order: collapse/tile, reduction, present, deviceptr.
        """
        out = []

        # `collapse` wins over `tile`; at most one of the two is emitted
        if ncollapse:
            out.append('collapse(%d)' % ncollapse)
        elif tile:
            out.append('tile(%s)' % ','.join(map(str, tile)))

        if reduction:
            out.append(make_clause_reduction(reduction))

        indexeds = FindSymbols('indexeds').visit(kwargs['nodes'])

        # Indexeds whose underlying function is `_mem_local` go into `deviceptr`
        deviceptrs = filter_ordered(
            idx.name for idx in indexeds if idx.function._mem_local
        )

        def _is_present(idx):
            # Anything on the device that isn't already listed as a deviceptr
            return (is_on_device(idx, kwargs['gpu_fit'])
                    and idx.name not in deviceptrs)

        presents = filter_ordered(
            idx.name for idx in indexeds if _is_present(idx)
        )

        # The NVC 20.7 and 20.9 compilers have a bug which triggers data movement for
        # indirectly indexed arrays (e.g., a[b[i]]) unless a present clause is used
        if presents:
            out.append("present(%s)" % ",".join(presents))

        if deviceptrs:
            out.append("deviceptr(%s)" % ",".join(deviceptrs))

        return out
Exemplo n.º 5
0
    def _make_clauses(cls, ncollapse=None, reduction=None, **kwargs):
        """
        Build the list of clause strings for the given parameters.

        Parameters
        ----------
        ncollapse : optional
            Collapse depth; a falsy value results in `collapse(1)`.
        reduction : optional
            If truthy, the clause built by `make_clause_reduction` is emitted.
        **kwargs
            Must provide 'nodes' (visited to find the symbols) and, if any
            AbstractFunction symbol is found, 'gpu_fit' (forwarded to
            `is_on_device`).

        Returns
        -------
        list of str
            Order: collapse, reduction, present, deviceptr.
        """
        # A `collapse` clause is always emitted
        out = ['collapse(%d)' % (ncollapse or 1)]

        if reduction:
            out.append(make_clause_reduction(reduction))

        symbols = FindSymbols().visit(kwargs['nodes'])

        # Arrays flagged `_mem_default` go into `deviceptr`
        deviceptrs = []
        for s in symbols:
            if s.is_Array and s._mem_default:
                deviceptrs.append(s.name)

        # AbstractFunctions on the device, not already listed as deviceptr, go
        # into `present`
        presents = []
        for s in symbols:
            if not s.is_AbstractFunction:
                continue
            if not is_on_device(s, kwargs['gpu_fit']):
                continue
            if s.name in deviceptrs:
                continue
            presents.append(s.name)

        # The NVC 20.7 and 20.9 compilers have a bug which triggers data movement for
        # indirectly indexed arrays (e.g., a[b[i]]) unless a present clause is used
        if presents:
            out.append("present(%s)" % ",".join(presents))

        if deviceptrs:
            out.append("deviceptr(%s)" % ",".join(deviceptrs))

        return out
Exemplo n.º 6
0
 def needs_transfer(f):
     """
     Truthy if `f` is an AbstractFunction for which `is_on_device` holds and
     whose `_mem_mapped` flag is set.
     """
     # Anything that isn't an AbstractFunction is ruled out straight away
     if not isinstance(f, AbstractFunction):
         return False
     return is_on_device(f, self.gpu_fit) and f._mem_mapped
Exemplo n.º 7
0
 def needs_transfer(f):
     """
     Truthy if `is_on_device` holds for `f` and `f` is an Array, a Function
     or an AbstractSparseFunction.
     """
     transferable = (Array, Function, AbstractSparseFunction)
     return is_on_device(f, self.gpu_fit) and isinstance(f, transferable)