Example no. 1
def autotune(operator, args, level, mode):
    """
    Operator autotuning.

    Parameters
    ----------
    operator : Operator
        Input Operator.
    args : dict_like
        The runtime arguments with which `operator` is run.
    level : str
        The autotuning aggressiveness (basic, aggressive). A more aggressive
        autotuning might eventually result in higher performance, though in
        some circumstances it might instead increase the actual runtime.
    mode : str
        The autotuning mode (preemptive, runtime). In preemptive mode, the
        output runtime values supplied by the user to `operator.apply` are
        replaced with shadow copies.
    """
    key = [level, mode]
    accepted = configuration._accepted['autotuning']
    if key not in accepted:
        raise ValueError("The accepted `(level, mode)` combinations are `%s`; "
                         "provided `%s` instead" % (accepted, key))

    # Tunable objects
    blockable = [i for i in operator.dimensions if isinstance(i, BlockDimension)]
    nthreads = [i for i in operator.input if isinstance(i, NThreads)]

    if len(nthreads + blockable) == 0:
        # Nothing to tune for
        return args, {}

    # We get passed all the arguments, but the cfunction only requires a subset
    at_args = OrderedDict([(p.name, args[p.name]) for p in operator.parameters])

    # User-provided output data won't be altered in `preemptive` mode
    if mode == 'preemptive':
        output = {i.name: i for i in operator.output}
        copies = {k: output[k]._C_as_ndarray(v).copy()
                  for k, v in args.items() if k in output}
        # WARNING: `copies` keeps references to the numpy arrays; this is
        # required to prevent garbage collection from kicking in during
        # autotuning and prematurely freeing the shadow copies handed over
        # to C-land
        at_args.update({k: output[k]._C_make_dataobj(v) for k, v in copies.items()})

    # Disable halo exchanges as the number of autotuning steps performed on each
    # rank may be different. Also, this makes the autotuning runtimes reliable
    # regardless of whether the timed regions include the halo exchanges or not,
    # as now the halo exchanges become a no-op.
    try:
        nb = []
        if mode != 'runtime':
            for i, _ in at_args['nb']._obj._fields_:
                nb.append((i, getattr(at_args['nb']._obj, i)))
                setattr(at_args['nb']._obj, i, MPI.PROC_NULL)
    except KeyError:
        assert not configuration['mpi']

    roots = [operator.body] + [i.root for i in operator._func_table.values()]
    trees = retrieve_iteration_tree(roots)

    # Shrink the time dimension's iteration range for quick autotuning
    steppers = {i for i in flatten(trees) if i.dim.is_Time}
    if len(steppers) == 0:
        stepper = None
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers.pop()
        timesteps = init_time_bounds(stepper, at_args)
        if timesteps is None:
            return args, {}
    else:
        warning("cannot perform autotuning unless there is one time loop; skipping")
        return args, {}

    # Formula to calculate the number of parallel blocks given block shape,
    # number of threads, and size of the parallel iteration space
    calculate_parblocks = make_calculate_parblocks(trees, blockable, nthreads)

    # Generate loop-blocking attempts
    block_shapes = generate_block_shapes(blockable, args, level)

    # Generate nthreads attempts
    nthreads = generate_nthreads(nthreads, args, level)

    generators = [i for i in [block_shapes, nthreads] if i]

    timings = OrderedDict()
    for i in product(*generators):
        run = tuple(chain(*i))
        mapper = OrderedDict(run)

        # Can we safely autotune over the given time range?
        if not check_time_bounds(stepper, at_args, args, mode):
            break

        # Update `at_args` to use the new tunable values
        at_args = {k: mapper.get(k, v) for k, v in at_args.items()}

        if heuristically_discard_run(calculate_parblocks, at_args):
            continue

        # Make sure we remain within stack bounds, otherwise skip run
        try:
            stack_footprint = operator._mem_summary['stack']
            if int(evaluate(stack_footprint, **at_args)) > options['stack_limit']:
                continue
        except TypeError:
            warning("couldn't determine stack size; skipping run %s" % str(i))
            continue
        except AttributeError:
            assert stack_footprint == 0

        # Use fresh profiling data
        timer = operator._profiler.timer.reset()
        at_args[operator._profiler.name] = timer

        operator.cfunction(*list(at_args.values()))
        elapsed = sum(getattr(timer._obj, k) for k, _ in timer._obj._fields_)
        timings[run] = elapsed
        log("run <%s> took %f (s) in %d timesteps" %
            (','.join('%s=%s' % (k, v)
                      for k, v in mapper.items()), elapsed, timesteps))

        # Prepare for the next autotuning run
        update_time_bounds(stepper, at_args, timesteps, mode)

    try:
        best = dict(min(timings, key=timings.get))
        log("selected best: %s" % best)
    except ValueError:
        warning("couldn't perform any runs")
        return args, {}

    # Build the new argument list
    args = {k: best.get(k, v) for k, v in args.items()}

    # In `runtime` mode, some timesteps have been executed already, so we must
    # adjust the time range
    finalize_time_bounds(stepper, at_args, args, mode)

    # Reset profiling data
    assert operator._profiler.name in args
    args[operator._profiler.name] = operator._profiler.timer.reset()

    # Reinstate MPI neighbourhood
    for i, v in nb:
        setattr(args['nb']._obj, i, v)

    # Autotuning summary
    summary = {}
    summary['runs'] = len(timings)
    summary['tpr'] = timesteps  # tpr -> timesteps per run
    summary['tuned'] = dict(best)

    return args, summary
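
In the `preemptive` branch above, the user's output arrays are replaced with shadow copies so the trial runs cannot corrupt real data, and the `copies` dict doubles as an anchor that keeps those arrays alive while C-land holds raw pointers to them. Below is a minimal, self-contained sketch of that idea using plain NumPy; the `make_shadow_copies` helper and the sample arguments are illustrative, not Devito API.

import numpy as np

def make_shadow_copies(args, output_names):
    # Deep-copy only the output arrays: trial runs write into the copies,
    # while the user-provided data stays untouched. Holding the copies in a
    # dict also keeps them referenced, so the garbage collector cannot free
    # them while a trial run still points at their memory.
    return {k: np.copy(v) for k, v in args.items() if k in output_names}

args = {'u': np.zeros((4, 4)), 'dt': 0.1}   # 'u' stands in for an Operator output
copies = make_shadow_copies(args, output_names={'u'})
copies['u'][:] = 42.0           # trial runs scribble on the copy...
assert np.all(args['u'] == 0)   # ...and the original data is preserved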
Example no. 2
def autotune(operator, args, level, mode):
    """
    Operator autotuning.

    Parameters
    ----------
    operator : Operator
        Input Operator.
    args : dict_like
        The runtime arguments with which `operator` is run.
    level : str
        The autotuning aggressiveness (basic, aggressive, max). A more
        aggressive autotuning might eventually result in higher runtime
        performance, but the autotuning phase will take longer.
    mode : str
        The autotuning mode (preemptive, runtime). In preemptive mode, the
        output runtime values supplied by the user to `operator.apply` are
        replaced with shadow copies.
    """
    key = [level, mode]
    accepted = configuration._accepted['autotuning']
    if key not in accepted:
        raise ValueError("The accepted `(level, mode)` combinations are `%s`; "
                         "provided `%s` instead" % (accepted, key))

    # We get passed all the arguments, but the cfunction only requires a subset
    at_args = OrderedDict([(p.name, args[p.name]) for p in operator.parameters])

    # User-provided output data won't be altered in `preemptive` mode
    if mode == 'preemptive':
        output = {i.name: i for i in operator.output}
        copies = {k: output[k]._C_as_ndarray(v).copy()
                  for k, v in args.items() if k in output}
        # WARNING: `copies` keeps references to the numpy arrays; this is
        # required to prevent garbage collection from kicking in during
        # autotuning and prematurely freeing the shadow copies handed over
        # to C-land
        at_args.update({k: output[k]._C_make_dataobj(v) for k, v in copies.items()})

    # Disable halo exchanges through MPI_PROC_NULL
    if mode in ['preemptive', 'destructive']:
        for p in operator.parameters:
            if isinstance(p, MPINeighborhood):
                at_args.update(MPINeighborhood(p.neighborhood)._arg_values())
                for i in p.fields:
                    setattr(at_args[p.name]._obj, i, MPI.PROC_NULL)
            elif isinstance(p, MPIMsgEnriched):
                at_args.update(MPIMsgEnriched(p.name, p.function, p.halos)._arg_values())
                for i in at_args[p.name]:
                    i.fromrank = MPI.PROC_NULL
                    i.torank = MPI.PROC_NULL

    roots = [operator.body] + [i.root for i in operator._func_table.values()]
    trees = filter_ordered(retrieve_iteration_tree(roots), key=lambda i: i.root)

    # Detect the time-stepping Iteration; shrink its iteration range so that
    # each autotuning run only takes a few iterations
    steppers = {i for i in flatten(trees) if i.dim.is_Time}
    if len(steppers) == 0:
        stepper = None
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers.pop()
        timesteps = init_time_bounds(stepper, at_args, args)
        if timesteps is None:
            return args, {}
    else:
        warning("cannot perform autotuning unless there is one time loop; skipping")
        return args, {}

    # Perform autotuning
    timings = {}
    for n, tree in enumerate(trees):
        blockable = [i.dim for i in tree if isinstance(i.dim, BlockDimension)]

        # Tunable arguments
        try:
            tunable = []
            tunable.append(generate_block_shapes(blockable, args, level))
            tunable.append(generate_nthreads(operator.nthreads, args, level))
            tunable = list(product(*tunable))
        except ValueError:
            # Some arguments are compulsory, otherwise autotuning is skipped
            continue

        # Symbolic number of loop-blocking blocks per thread
        nblocks_per_thread = calculate_nblocks(tree, blockable) / operator.nthreads

        for bs, nt in tunable:
            # Can we safely autotune over the given time range?
            if not check_time_bounds(stepper, at_args, args, mode):
                break

            # Update `at_args` to use the new tunable arguments
            run = [(k, v) for k, v in bs + nt if k in at_args]
            at_args.update(dict(run))

            # Drop run if not at least one block per thread
            if not configuration['develop-mode'] and nblocks_per_thread.subs(at_args) < 1:
                continue

            # Make sure we remain within stack bounds, otherwise skip run
            try:
                stack_footprint = operator._mem_summary['stack']
                if int(evaluate(stack_footprint, **at_args)) > options['stack_limit']:
                    continue
            except TypeError:
                warning("couldn't determine stack size; skipping run %s" % str(i))
                continue
            except AttributeError:
                assert stack_footprint == 0

            # Run the Operator
            operator.cfunction(*list(at_args.values()))
            elapsed = operator._profiler.timer.total

            timings.setdefault(nt, OrderedDict()).setdefault(n, {})[bs] = elapsed
            log("run <%s> took %f (s) in %d timesteps" %
                (','.join('%s=%s' % i for i in run), elapsed, timesteps))

            # Prepare for the next autotuning run
            update_time_bounds(stepper, at_args, timesteps, mode)

            # Reset profiling timers
            operator._profiler.timer.reset()

    # The best variant is the one that, for a given number of threads, had the
    # minimum turnaround time
    try:
        runs = 0
        mapper = {}
        for k, v in timings.items():
            for i in v.values():
                runs += len(i)
                record = mapper.setdefault(k, Record())
                record.add(min(i, key=i.get), min(i.values()))
        best = min(mapper, key=mapper.get)
        best = OrderedDict(best + tuple(mapper[best].args))
        best.pop(None, None)
        log("selected <%s>" % (','.join('%s=%s' % i for i in best.items())))
    except ValueError:
        warning("couldn't perform any runs")
        return args, {}

    # Update the argument list with the tuned arguments
    args.update(best)

    # In `runtime` mode, some timesteps have been executed already, so we must
    # adjust the time range
    finalize_time_bounds(stepper, at_args, args, mode)

    # Autotuning summary
    summary = {}
    summary['runs'] = runs
    summary['tpr'] = timesteps  # tpr -> timesteps per run
    summary['tuned'] = dict(best)

    return args, summary
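
The selection step in this variant folds the nested `timings` structure (one entry per thread count, per iteration tree, per block shape) down to the single fastest configuration. A stripped-down sketch of that reduction follows, with made-up timing data, a single iteration tree, and without Devito's `Record` helper.

from collections import OrderedDict

# Hypothetical timings: {nthreads run: {tree index: {block-shape run: seconds}}}
timings = {
    (('nthreads', 4),): {0: {(('x0_blk0_size', 8),): 1.9,
                             (('x0_blk0_size', 16),): 1.4}},
    (('nthreads', 8),): {0: {(('x0_blk0_size', 8),): 1.1,
                             (('x0_blk0_size', 16),): 1.3}},
}

# For each thread count keep its fastest block shape, then pick the thread
# count whose best time is the global minimum
per_nt = {nt: min(v[0].items(), key=lambda kv: kv[1]) for nt, v in timings.items()}
best_nt = min(per_nt, key=lambda nt: per_nt[nt][1])
best = OrderedDict(best_nt + per_nt[best_nt][0])
print(dict(best))  # {'nthreads': 8, 'x0_blk0_size': 8}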
Example no. 3
def autotune(operator, args, level, mode):
    """
    Operator autotuning.

    Parameters
    ----------
    operator : Operator
        Input Operator.
    args : dict_like
        The runtime arguments with which `operator` is run.
    level : str
        The autotuning aggressiveness (basic, aggressive, max). A more
        aggressive autotuning might eventually result in higher runtime
        performance, but the autotuning phase will take longer.
    mode : str
        The autotuning mode (preemptive, runtime). In preemptive mode, the
        output runtime values supplied by the user to `operator.apply` are
        replaced with shadow copies.
    """
    key = [level, mode]
    accepted = configuration._accepted['autotuning']
    if key not in accepted:
        raise ValueError("The accepted `(level, mode)` combinations are `%s`; "
                         "provided `%s` instead" % (accepted, key))

    # We get passed all the arguments, but the cfunction only requires a subset
    at_args = OrderedDict([(p.name, args[p.name]) for p in operator.parameters])

    # User-provided output data won't be altered in `preemptive` mode
    if mode == 'preemptive':
        output = {i.name: i for i in operator.output}
        copies = {k: output[k]._C_as_ndarray(v).copy()
                  for k, v in args.items() if k in output}
        # WARNING: `copies` keeps references to the numpy arrays; this is
        # required to prevent garbage collection from kicking in during
        # autotuning and prematurely freeing the shadow copies handed over
        # to C-land
        at_args.update({k: output[k]._C_make_dataobj(v) for k, v in copies.items()})

    # Disable halo exchanges through MPI_PROC_NULL
    if mode in ['preemptive', 'destructive']:
        for p in operator.parameters:
            if isinstance(p, MPINeighborhood):
                at_args.update(MPINeighborhood(p.fields)._arg_values())
                for i in p.fields:
                    setattr(at_args[p.name]._obj, i, MPI.PROC_NULL)
            elif isinstance(p, MPIMsgEnriched):
                at_args.update(MPIMsgEnriched(p.name, p.function, p.halos)._arg_values())
                for i in at_args[p.name]:
                    i.fromrank = MPI.PROC_NULL
                    i.torank = MPI.PROC_NULL

    roots = [operator.body] + [i.root for i in operator._func_table.values()]
    trees = filter_ordered(retrieve_iteration_tree(roots), key=lambda i: i.root)

    # Detect the time-stepping Iteration; shrink its iteration range so that
    # each autotuning run only takes a few iterations
    steppers = {i for i in flatten(trees) if i.dim.is_Time}
    if len(steppers) == 0:
        stepper = None
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers.pop()
        timesteps = init_time_bounds(stepper, at_args)
        if timesteps is None:
            return args, {}
    else:
        warning("cannot perform autotuning unless there is one time loop; skipping")
        return args, {}

    # Perform autotuning
    timings = {}
    for n, tree in enumerate(trees):
        blockable = [i.dim for i in tree if isinstance(i.dim, BlockDimension)]

        # Tunable arguments
        try:
            tunable = []
            tunable.append(generate_block_shapes(blockable, args, level))
            tunable.append(generate_nthreads(operator.nthreads, args, level))
            tunable = list(product(*tunable))
        except ValueError:
            # Some arguments are compulsory, otherwise autotuning is skipped
            continue

        # Symbolic number of loop-blocking blocks per thread
        nblocks_per_thread = calculate_nblocks(tree, blockable) / operator.nthreads

        for bs, nt in tunable:
            # Can we safely autotune over the given time range?
            if not check_time_bounds(stepper, at_args, args, mode):
                break

            # Update `at_args` to use the new tunable arguments
            run = [(k, v) for k, v in bs + nt if k in at_args]
            at_args.update(dict(run))

            # Drop run if not at least one block per thread
            if not configuration['develop-mode'] and nblocks_per_thread.subs(at_args) < 1:
                continue

            # Make sure we remain within stack bounds, otherwise skip run
            try:
                stack_footprint = operator._mem_summary['stack']
                if int(evaluate(stack_footprint, **at_args)) > options['stack_limit']:
                    continue
            except TypeError:
                warning("couldn't determine stack size; skipping run %s" % str(i))
                continue
            except AttributeError:
                assert stack_footprint == 0

            # Run the Operator
            operator.cfunction(*list(at_args.values()))
            elapsed = operator._profiler.timer.total

            timings.setdefault(nt, OrderedDict()).setdefault(n, {})[bs] = elapsed
            log("run <%s> took %f (s) in %d timesteps" %
                (','.join('%s=%s' % i for i in run), elapsed, timesteps))

            # Prepare for the next autotuning run
            update_time_bounds(stepper, at_args, timesteps, mode)

            # Reset profiling timers
            operator._profiler.timer.reset()

    # The best variant is the one that, for a given number of threads, had the
    # minimum turnaround time
    try:
        runs = 0
        mapper = {}
        for k, v in timings.items():
            for i in v.values():
                runs += len(i)
                record = mapper.setdefault(k, Record())
                record.add(min(i, key=i.get), min(i.values()))
        best = min(mapper, key=mapper.get)
        best = OrderedDict(best + tuple(mapper[best].args))
        best.pop(None, None)
        log("selected <%s>" % (','.join('%s=%s' % i for i in best.items())))
    except ValueError:
        warning("couldn't perform any runs")
        return args, {}

    # Update the argument list with the tuned arguments
    args.update(best)

    # In `runtime` mode, some timesteps have been executed already, so we must
    # adjust the time range
    finalize_time_bounds(stepper, at_args, args, mode)

    # Autotuning summary
    summary = {}
    summary['runs'] = runs
    summary['tpr'] = timesteps  # tpr -> timesteps per run
    summary['tuned'] = dict(best)

    return args, summary
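
All three variants rely on `init_time_bounds`, `check_time_bounds`, `update_time_bounds` and `finalize_time_bounds` to shrink the time loop to a handful of iterations per trial and, in `runtime` mode, to slide that window forward so no timestep is executed twice on live data. The following is a simplified, self-contained stand-in for that bookkeeping; the five-step window and the function bodies are assumptions for illustration, not the real implementations.

TRIAL_STEPS = 5  # assumed number of timesteps per autotuning run

def init_window(t_start, t_end):
    # Shrink the user-requested range [t_start, t_end] to a short trial window;
    # bail out (return None) if there is not enough room to autotune at all
    if t_end - t_start + 1 < TRIAL_STEPS:
        return None
    return (t_start, t_start + TRIAL_STEPS - 1)

def next_window(window, mode):
    lo, hi = window
    if mode == 'runtime':
        # Runtime-mode trials do real work on live data: slide forward so the
        # same timesteps are never executed twice
        return (hi + 1, hi + TRIAL_STEPS)
    # Preemptive-mode trials only touch shadow copies: reuse the same window
    return (lo, hi)

window = init_window(1, 100)
for trial in range(3):
    # ... run the Operator over `window` with one candidate configuration ...
    window = next_window(window, mode='runtime')
print(window)  # (16, 20): the next window, after trials over (1,5), (6,10), (11,15)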