Example #1
def test_mpi_fullmode_objects():
    grid = Grid(shape=(4, 4, 4))
    x, y, _ = grid.dimensions

    # Message: an MPIMsgEnriched must survive a pickle round-trip
    f = Function(name='f', grid=grid)
    obj = MPIMsgEnriched('msg', f, [Halo(x, LEFT)])
    pkl_obj = pickle.dumps(obj)
    new_obj = pickle.loads(pkl_obj)
    assert obj.name == new_obj.name
    assert obj.target.name == new_obj.target.name
    assert all(
        obj.target.dimensions[i].name == new_obj.target.dimensions[i].name
        for i in range(grid.dim))
    assert new_obj.target.dimensions[0] is new_obj.halos[0].dim  # same Dimension object

    # Region: an MPIRegion must survive a pickle round-trip, preserving internal object identities
    x_m, x_M = x.symbolic_min, x.symbolic_max
    y_m, y_M = y.symbolic_min, y.symbolic_max
    obj = MPIRegion('reg', 1, [y, x], [(((x, OWNED, LEFT), ), {
        x: (x_m, Min(x_M, x_m))
    }), (((y, OWNED, LEFT), ), {
        y: (y_m, Min(y_M, y_m))
    })])
    pkl_obj = pickle.dumps(obj)
    new_obj = pickle.loads(pkl_obj)
    assert obj.prefix == new_obj.prefix
    assert obj.key == new_obj.key
    assert obj.name == new_obj.name
    assert len(new_obj.arguments) == 2
    assert all(d0.name == d1.name
               for d0, d1 in zip(obj.arguments, new_obj.arguments))
    assert all(new_obj.arguments[i] is new_obj.owned[i][0][0][0]  # `x` and `y`
               for i in range(2))
    assert new_obj.owned[0][0][0][1] is new_obj.owned[1][0][0][1]  # `OWNED`
    assert new_obj.owned[0][0][0][2] is new_obj.owned[1][0][0][2]  # `LEFT`
    for n, i in enumerate(new_obj.owned):
        d, v = list(i[1].items())[0]
        assert d is new_obj.arguments[n]
        assert v[0] is d.symbolic_min
        assert v[1] == Min(d.symbolic_max, d.symbolic_min)
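
The test above exercises the dumps/loads round-trip on MPI-specific objects. Below is a minimal, self-contained sketch of the same pattern applied to a plain Function (assuming a standard Devito installation; the names and shapes are illustrative only), useful as a starting point before moving to the MPI machinery.

import pickle

from devito import Grid, Function

# Build a small Function on a 2D grid
grid = Grid(shape=(4, 4))
f = Function(name='f', grid=grid)

# Serialize, deserialize, then check the metadata round-trips
new_f = pickle.loads(pickle.dumps(f))
assert new_f.name == f.name
assert new_f.shape == f.shape
assert all(d0.name == d1.name for d0, d1 in zip(f.dimensions, new_f.dimensions))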
Example #2
def autotune(operator, args, level, mode):
    """
    Operator autotuning.

    Parameters
    ----------
    operator : Operator
        Input Operator.
    args : dict_like
        The runtime arguments with which `operator` is run.
    level : str
        The autotuning aggressiveness (basic, aggressive, max). A more
        aggressive autotuning might eventually result in higher runtime
        performance, but the autotuning phase will take longer.
    mode : str
        The autotuning mode (preemptive, runtime, destructive). In preemptive
        mode, the output runtime values supplied by the user to `operator.apply`
        are replaced with shadow copies, so the user-provided data are left
        untouched. In runtime mode, the timesteps consumed by the autotuning
        runs count towards the actual computation, so the time bounds are
        adjusted accordingly once autotuning is over.
    """
    key = [level, mode]
    accepted = configuration._accepted['autotuning']
    if key not in accepted:
        raise ValueError("The accepted `(level, mode)` combinations are `%s`; "
                         "provided `%s` instead" % (accepted, key))

    # We get passed all the arguments, but the cfunction only requires a subset
    at_args = OrderedDict([(p.name, args[p.name]) for p in operator.parameters])

    # User-provided output data won't be altered in `preemptive` mode
    if mode == 'preemptive':
        output = {i.name: i for i in operator.output}
        copies = {k: output[k]._C_as_ndarray(v).copy()
                  for k, v in args.items() if k in output}
        # WARNING: `copies` keeps references to the numpy arrays; this is required
        # to prevent garbage collection from kicking in during autotuning and
        # prematurely freeing the shadow copies handed over to C-land
        at_args.update({k: output[k]._C_make_dataobj(v) for k, v in copies.items()})

    # Disable halo exchanges through MPI_PROC_NULL
    if mode in ['preemptive', 'destructive']:
        for p in operator.parameters:
            if isinstance(p, MPINeighborhood):
                at_args.update(MPINeighborhood(p.neighborhood)._arg_values())
                for i in p.fields:
                    setattr(at_args[p.name]._obj, i, MPI.PROC_NULL)
            elif isinstance(p, MPIMsgEnriched):
                at_args.update(MPIMsgEnriched(p.name, p.function, p.halos)._arg_values())
                for i in at_args[p.name]:
                    i.fromrank = MPI.PROC_NULL
                    i.torank = MPI.PROC_NULL

    roots = [operator.body] + [i.root for i in operator._func_table.values()]
    trees = filter_ordered(retrieve_iteration_tree(roots), key=lambda i: i.root)

    # Detect the time-stepping Iteration; shrink its iteration range so that
    # each autotuning run only takes a few iterations
    steppers = {i for i in flatten(trees) if i.dim.is_Time}
    if len(steppers) == 0:
        stepper = None
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers.pop()
        timesteps = init_time_bounds(stepper, at_args, args)
        if timesteps is None:
            return args, {}
    else:
        warning("cannot perform autotuning unless there is one time loop; skipping")
        return args, {}

    # Perform autotuning
    timings = {}
    for n, tree in enumerate(trees):
        blockable = [i.dim for i in tree if isinstance(i.dim, BlockDimension)]

        # Tunable arguments
        try:
            tunable = []
            tunable.append(generate_block_shapes(blockable, args, level))
            tunable.append(generate_nthreads(operator.nthreads, args, level))
            tunable = list(product(*tunable))
        except ValueError:
            # A compulsory tunable argument is unavailable, so skip this tree
            continue

        # Symbolic number of loop-blocking blocks per thread
        nblocks_per_thread = calculate_nblocks(tree, blockable) / operator.nthreads

        for bs, nt in tunable:
            # Can we safely autotune over the given time range?
            if not check_time_bounds(stepper, at_args, args, mode):
                break

            # Update `at_args` to use the new tunable arguments
            run = [(k, v) for k, v in bs + nt if k in at_args]
            at_args.update(dict(run))

            # Skip this run unless there is at least one block per thread
            if not configuration['develop-mode'] and nblocks_per_thread.subs(at_args) < 1:
                continue

            # Make sure we remain within stack bounds, otherwise skip run
            try:
                stack_footprint = operator._mem_summary['stack']
                if int(evaluate(stack_footprint, **at_args)) > options['stack_limit']:
                    continue
            except TypeError:
                warning("couldn't determine stack size; skipping run %s" % str(i))
                continue
            except AttributeError:
                assert stack_footprint == 0

            # Run the Operator
            operator.cfunction(*list(at_args.values()))
            elapsed = operator._profiler.timer.total

            timings.setdefault(nt, OrderedDict()).setdefault(n, {})[bs] = elapsed
            log("run <%s> took %f (s) in %d timesteps" %
                (','.join('%s=%s' % i for i in run), elapsed, timesteps))

            # Prepare for the next autotuning run
            update_time_bounds(stepper, at_args, timesteps, mode)

            # Reset profiling timers
            operator._profiler.timer.reset()

    # The best variant is the one that, for a given number of threads, had the
    # minimum turnaround time
    try:
        runs = 0
        mapper = {}
        for k, v in timings.items():
            for i in v.values():
                runs += len(i)
                record = mapper.setdefault(k, Record())
                record.add(min(i, key=i.get), min(i.values()))
        best = min(mapper, key=mapper.get)
        best = OrderedDict(best + tuple(mapper[best].args))
        best.pop(None, None)
        log("selected <%s>" % (','.join('%s=%s' % i for i in best.items())))
    except ValueError:
        warning("couldn't perform any runs")
        return args, {}

    # Update the argument list with the tuned arguments
    args.update(best)

    # In `runtime` mode, some timesteps have been executed already, so we must
    # adjust the time range
    finalize_time_bounds(stepper, at_args, args, mode)

    # Autotuning summary
    summary = {}
    summary['runs'] = runs
    summary['tpr'] = timesteps  # tpr -> timesteps per run
    summary['tuned'] = dict(best)

    return args, summary
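
In practice `autotune` is not called directly; it is triggered through `Operator.apply`. The sketch below shows how a `(level, mode)` pair like the one validated at the top of `autotune` is typically supplied (a minimal sketch assuming a standard Devito installation; the stencil is only a placeholder to produce blockable loops).

from devito import Grid, TimeFunction, Eq, Operator

# A throwaway diffusion-like stencil, just to have something blockable
grid = Grid(shape=(64, 64, 64))
u = TimeFunction(name='u', grid=grid, space_order=2)
op = Operator(Eq(u.forward, u + 0.1 * u.laplace))

# `autotune` may be a bool, a level string, or a (level, mode) tuple;
# the tuple form maps onto the `level` and `mode` parameters above
summary = op.apply(time_M=10, autotune=('aggressive', 'preemptive'))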