def test_mpi_fullmode_objects():
    grid = Grid(shape=(4, 4, 4))
    x, y, _ = grid.dimensions

    # Message
    f = Function(name='f', grid=grid)
    obj = MPIMsgEnriched('msg', f, [Halo(x, LEFT)])
    pkl_obj = pickle.dumps(obj)
    new_obj = pickle.loads(pkl_obj)
    assert obj.name == new_obj.name
    assert obj.target.name == new_obj.target.name
    assert all(obj.target.dimensions[i].name == new_obj.target.dimensions[i].name
               for i in range(grid.dim))
    assert new_obj.target.dimensions[0] is new_obj.halos[0].dim

    # Region
    x_m, x_M = x.symbolic_min, x.symbolic_max
    y_m, y_M = y.symbolic_min, y.symbolic_max
    obj = MPIRegion('reg', 1, [y, x],
                    [(((x, OWNED, LEFT),), {x: (x_m, Min(x_M, x_m))}),
                     (((y, OWNED, LEFT),), {y: (y_m, Min(y_M, y_m))})])
    pkl_obj = pickle.dumps(obj)
    new_obj = pickle.loads(pkl_obj)
    assert obj.prefix == new_obj.prefix
    assert obj.key == new_obj.key
    assert obj.name == new_obj.name
    assert len(new_obj.arguments) == 2
    assert all(d0.name == d1.name for d0, d1 in zip(obj.arguments, new_obj.arguments))
    assert all(new_obj.arguments[i] is new_obj.owned[i][0][0][0]  # `x` and `y`
               for i in range(2))
    assert new_obj.owned[0][0][0][1] is new_obj.owned[1][0][0][1]  # `OWNED`
    assert new_obj.owned[0][0][0][2] is new_obj.owned[1][0][0][2]  # `LEFT`
    for n, i in enumerate(new_obj.owned):
        d, v = list(i[1].items())[0]
        assert d is new_obj.arguments[n]
        assert v[0] is d.symbolic_min
        assert v[1] == Min(d.symbolic_max, d.symbolic_min)
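
# A minimal, hypothetical sketch (not part of the original test suite) of the same
# dumps/loads round-trip pattern exercised above, applied to a plain Function. The
# helper name and the specific assertions are illustrative assumptions only.
def _example_function_pickle_roundtrip():
    grid = Grid(shape=(4, 4, 4))
    f = Function(name='f', grid=grid)
    new_f = pickle.loads(pickle.dumps(f))
    # The reconstructed Function preserves its symbolic metadata
    assert f.name == new_f.name
    assert all(d0.name == d1.name for d0, d1 in zip(f.dimensions, new_f.dimensions))
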
def autotune(operator, args, level, mode):
    """
    Operator autotuning.

    Parameters
    ----------
    operator : Operator
        Input Operator.
    args : dict_like
        The runtime arguments with which `operator` is run.
    level : str
        The autotuning aggressiveness (basic, aggressive, max). A more
        aggressive autotuning might eventually result in higher runtime
        performance, but the autotuning phase will take longer.
    mode : str
        The autotuning mode (preemptive, runtime). In preemptive mode, the
        output runtime values supplied by the user to `operator.apply` are
        replaced with shadow copies.
    """
    key = [level, mode]
    accepted = configuration._accepted['autotuning']
    if key not in accepted:
        raise ValueError("The accepted `(level, mode)` combinations are `%s`; "
                         "provided `%s` instead" % (accepted, key))

    # We get passed all the arguments, but the cfunction only requires a subset
    at_args = OrderedDict([(p.name, args[p.name]) for p in operator.parameters])

    # User-provided output data won't be altered in `preemptive` mode
    if mode == 'preemptive':
        output = {i.name: i for i in operator.output}
        copies = {k: output[k]._C_as_ndarray(v).copy()
                  for k, v in args.items() if k in output}
        # WARNING: `copies` keeps references to the numpy arrays, which is required
        # to prevent garbage collection from kicking in during autotuning and
        # prematurely freeing the shadow copies handed over to C-land
        at_args.update({k: output[k]._C_make_dataobj(v) for k, v in copies.items()})

    # Disable halo exchanges through MPI_PROC_NULL
    if mode in ['preemptive', 'destructive']:
        for p in operator.parameters:
            if isinstance(p, MPINeighborhood):
                at_args.update(MPINeighborhood(p.neighborhood)._arg_values())
                for i in p.fields:
                    setattr(at_args[p.name]._obj, i, MPI.PROC_NULL)
            elif isinstance(p, MPIMsgEnriched):
                at_args.update(MPIMsgEnriched(p.name, p.function, p.halos)._arg_values())
                for i in at_args[p.name]:
                    i.fromrank = MPI.PROC_NULL
                    i.torank = MPI.PROC_NULL

    roots = [operator.body] + [i.root for i in operator._func_table.values()]
    trees = filter_ordered(retrieve_iteration_tree(roots), key=lambda i: i.root)

    # Detect the time-stepping Iteration; shrink its iteration range so that
    # each autotuning run only takes a few iterations
    steppers = {i for i in flatten(trees) if i.dim.is_Time}
    if len(steppers) == 0:
        stepper = None
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers.pop()
        timesteps = init_time_bounds(stepper, at_args, args)
        if timesteps is None:
            return args, {}
    else:
        warning("cannot perform autotuning unless there is one time loop; skipping")
        return args, {}

    # Perform autotuning
    timings = {}
    for n, tree in enumerate(trees):
        blockable = [i.dim for i in tree if isinstance(i.dim, BlockDimension)]

        # Tunable arguments
        try:
            tunable = []
            tunable.append(generate_block_shapes(blockable, args, level))
            tunable.append(generate_nthreads(operator.nthreads, args, level))
            tunable = list(product(*tunable))
        except ValueError:
            # Some arguments are compulsory, otherwise autotuning is skipped
            continue

        # Symbolic number of loop-blocking blocks per thread
        nblocks_per_thread = calculate_nblocks(tree, blockable) / operator.nthreads

        for bs, nt in tunable:
            # Can we safely autotune over the given time range?
            if not check_time_bounds(stepper, at_args, args, mode):
                break

            # Update `at_args` to use the new tunable arguments
            run = [(k, v) for k, v in bs + nt if k in at_args]
            at_args.update(dict(run))

            # Drop run if there isn't at least one block per thread
            if not configuration['develop-mode'] and nblocks_per_thread.subs(at_args) < 1:
                continue

            # Make sure we remain within stack bounds, otherwise skip run
            try:
                stack_footprint = operator._mem_summary['stack']
                if int(evaluate(stack_footprint, **at_args)) > options['stack_limit']:
                    continue
            except TypeError:
                warning("couldn't determine stack size; skipping run %s" % str(run))
                continue
            except AttributeError:
                assert stack_footprint == 0

            # Run the Operator
            operator.cfunction(*list(at_args.values()))
            elapsed = operator._profiler.timer.total
            timings.setdefault(nt, OrderedDict()).setdefault(n, {})[bs] = elapsed
            log("run <%s> took %f (s) in %d timesteps" %
                (','.join('%s=%s' % i for i in run), elapsed, timesteps))

            # Prepare for the next autotuning run
            update_time_bounds(stepper, at_args, timesteps, mode)

            # Reset profiling timers
            operator._profiler.timer.reset()

    # The best variant is the one that, for a given number of threads, had the
    # minimum turnaround time
    try:
        runs = 0
        mapper = {}
        for k, v in timings.items():
            for i in v.values():
                runs += len(i)
                record = mapper.setdefault(k, Record())
                record.add(min(i, key=i.get), min(i.values()))
        best = min(mapper, key=mapper.get)
        best = OrderedDict(best + tuple(mapper[best].args))
        best.pop(None, None)
        log("selected <%s>" % (','.join('%s=%s' % i for i in best.items())))
    except ValueError:
        warning("couldn't perform any runs")
        return args, {}

    # Update the argument list with the tuned arguments
    args.update(best)

    # In `runtime` mode, some timesteps have been executed already, so we must
    # adjust the time range
    finalize_time_bounds(stepper, at_args, args, mode)

    # Autotuning summary
    summary = {}
    summary['runs'] = runs
    summary['tpr'] = timesteps  # tpr -> timesteps per run
    summary['tuned'] = dict(best)

    return args, summary
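
# Usage sketch (an illustrative assumption, not part of this module). `autotune`
# is normally reached indirectly, through `Operator.apply`; the helper below shows
# one hypothetical way a user would trigger it. The equation, grid shape, and the
# 'aggressive' level are placeholders chosen only for illustration.
def _example_trigger_autotuning():
    from devito import Eq, Grid, Operator, TimeFunction

    grid = Grid(shape=(64, 64, 64))
    u = TimeFunction(name='u', grid=grid, space_order=2)
    op = Operator(Eq(u.forward, u + 1))
    # Passing `autotune` to `apply` routes through the (level, mode) validation
    # performed at the top of `autotune` above
    op.apply(time_M=10, autotune='aggressive')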