def main(ctx_factory, dim=2, order=3, visualize=False):
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)
    actx = PyOpenCLArrayContext(
        queue,
        allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
        force_device_scalars=True,
    )

    comm = MPI.COMM_WORLD
    num_parts = comm.Get_size()

    from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis
    mesh_dist = MPIMeshDistributor(comm)

    nel_1d = 16

    if mesh_dist.is_mananger_rank():
        from meshmode.mesh.generation import generate_regular_rect_mesh
        mesh = generate_regular_rect_mesh(
            a=(-0.5,)*dim,
            b=(0.5,)*dim,
            nelements_per_axis=(nel_1d,)*dim)

        logger.info("%d elements", mesh.nelements)

        part_per_element = get_partition_by_pymetis(mesh, num_parts)

        local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element, num_parts)

        del mesh
    else:
        local_mesh = mesh_dist.receive_mesh_part()

    dcoll = DiscretizationCollection(actx, local_mesh, order=order,
                                     mpi_communicator=comm)

    fields = flat_obj_array(
        bump(actx, dcoll),
        [dcoll.zeros(actx) for i in range(dcoll.dim)]
    )

    c = 1
    dt = 0.45 * estimate_rk4_timestep(actx, dcoll, c)

    vis = make_visualizer(dcoll)

    def rhs(t, w):
        return wave_operator(dcoll, c=c, w=w)

    if comm.rank == 0:
        logger.info("dt = %g", dt)

    t = 0
    t_final = 3
    istep = 0
    while t < t_final:
        fields = rk4_step(fields, t, dt, rhs)

        l2norm = op.norm(dcoll, fields[0], 2)

        if istep % 10 == 0:
            linfnorm = op.norm(dcoll, fields[0], np.inf)
            nodalmax = op.nodal_max(dcoll, "vol", fields[0])
            nodalmin = op.nodal_min(dcoll, "vol", fields[0])
            if comm.rank == 0:
                logger.info(f"step: {istep} t: {t} "
                            f"L2: {l2norm} "
                            f"Linf: {linfnorm} "
                            f"sol max: {nodalmax} "
                            f"sol min: {nodalmin}")
            if visualize:
                vis.write_parallel_vtk_file(
                    comm,
                    f"fld-wave-eager-mpi-{{rank:03d}}-{istep:04d}.vtu",
                    [
                        ("u", fields[0]),
                        ("v", fields[1:]),
                    ]
                )

        t += dt
        istep += 1

    # NOTE: These are here to ensure the solution is bounded for the
    # time interval specified
    assert l2norm < 1
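# A hypothetical command-line entry point for main() above (not part of the
# original snippet): it assumes pyopencl's create_some_context as the context
# factory and is meant to be launched under MPI, e.g.
#   mpiexec -n 4 python wave_eager_mpi.py --visualize
if __name__ == "__main__":
    import argparse
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument("--dim", type=int, default=2)
    parser.add_argument("--order", type=int, default=3)
    parser.add_argument("--visualize", action="store_true")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    main(cl.create_some_context,
         dim=args.dim, order=args.order, visualize=args.visualize)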
def simple_wave_entrypoint(dim=2, num_elems=256, order=4, num_steps=30,
                           log_filename="grudge.dat"):
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)

    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    num_parts = comm.Get_size()
    n = int(num_elems**(1./dim))

    from meshmode.distributed import MPIMeshDistributor
    mesh_dist = MPIMeshDistributor(comm)

    if mesh_dist.is_mananger_rank():
        from meshmode.mesh.generation import generate_regular_rect_mesh
        mesh = generate_regular_rect_mesh(a=(-0.5,)*dim,
                                          b=(0.5,)*dim,
                                          n=(n,)*dim)

        from pymetis import part_graph
        _, p = part_graph(num_parts,
                          xadj=mesh.nodal_adjacency.neighbors_starts.tolist(),
                          adjncy=mesh.nodal_adjacency.neighbors.tolist())
        part_per_element = np.array(p)

        local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element, num_parts)
    else:
        local_mesh = mesh_dist.receive_mesh_part()

    vol_discr = DiscretizationCollection(cl_ctx, local_mesh, order=order,
                                         mpi_communicator=comm)

    source_center = np.array([0.1, 0.22, 0.33])[:local_mesh.dim]
    source_width = 0.05
    source_omega = 3

    sym_x = sym.nodes(local_mesh.dim)
    sym_source_center_dist = sym_x - source_center
    sym_t = sym.ScalarVariable("t")

    from grudge.models.wave import StrongWaveOperator
    from meshmode.mesh import BTAG_ALL, BTAG_NONE
    op = StrongWaveOperator(
        -0.1, vol_discr.dim,
        source_f=(
            sym.sin(source_omega*sym_t)
            * sym.exp(
                -np.dot(sym_source_center_dist, sym_source_center_dist)
                / source_width**2)),
        dirichlet_tag=BTAG_NONE,
        neumann_tag=BTAG_NONE,
        radiation_tag=BTAG_ALL,
        flux_type="upwind")

    from pytools.obj_array import join_fields
    fields = join_fields(
        vol_discr.zeros(queue),
        [vol_discr.zeros(queue) for i in range(vol_discr.dim)])

    from logpyle import LogManager, \
        add_general_quantities, \
        add_run_info, \
        IntervalTimer, EventCounter

    # NOTE: LogManager hangs when using a file on a shared directory.
    logmgr = LogManager(log_filename, "w", comm)
    add_run_info(logmgr)
    add_general_quantities(logmgr)
    log_quantities = \
        {"rank_data_swap_timer": IntervalTimer("rank_data_swap_timer",
            "Time spent evaluating RankDataSwapAssign"),
         "rank_data_swap_counter": EventCounter("rank_data_swap_counter",
            "Number of RankDataSwapAssign instructions evaluated"),
         "exec_timer": IntervalTimer("exec_timer",
            "Total time spent executing instructions"),
         "insn_eval_timer": IntervalTimer("insn_eval_timer",
            "Time spent evaluating instructions"),
         "future_eval_timer": IntervalTimer("future_eval_timer",
            "Time spent evaluating futures"),
         "busy_wait_timer": IntervalTimer("busy_wait_timer",
            "Time wasted doing busy wait")}
    for quantity in log_quantities.values():
        logmgr.add_quantity(quantity)

    bound_op = bind(vol_discr, op.sym_operator())

    def rhs(t, w):
        val, rhs.profile_data = bound_op(queue,
                                         profile_data=rhs.profile_data,
                                         log_quantities=log_quantities,
                                         t=t, w=w)
        return val
    rhs.profile_data = {}

    dt = 0.04
    dt_stepper = set_up_rk4("w", dt, fields, rhs)

    logmgr.tick_before()
    for event in dt_stepper.run(t_end=dt*num_steps):
        if isinstance(event, dt_stepper.StateComputed):
            logmgr.tick_after()
            logmgr.tick_before()
    logmgr.tick_after()

    def print_profile_data(data):
        print("""execute() for rank %d:
\tInstruction Evaluation: %f%%
\tFuture Evaluation: %f%%
\tBusy Wait: %f%%
\tTotal: %f seconds""" %
              (comm.Get_rank(),
               data['insn_eval_time'] / data['total_time'] * 100,
               data['future_eval_time'] / data['total_time'] * 100,
               data['busy_wait_time'] / data['total_time'] * 100,
               data['total_time']))

    print_profile_data(rhs.profile_data)
    logmgr.close()
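# Hypothetical launcher for simple_wave_entrypoint() above; run it under MPI,
# e.g. `mpiexec -n 4 python simple_wave.py`. Per the note above, choose a
# log_filename on a non-shared filesystem to avoid LogManager hangs.
if __name__ == "__main__":
    simple_wave_entrypoint(dim=2, num_elems=256, order=4, num_steps=30)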
def main(ctx_factory, dim=2, order=4, visualize=False):
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    actx = PyOpenCLArrayContext(
        queue,
        allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
        force_device_scalars=True,
    )

    comm = MPI.COMM_WORLD
    num_parts = comm.Get_size()

    from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis
    mesh_dist = MPIMeshDistributor(comm)

    if mesh_dist.is_mananger_rank():
        from meshmode.mesh.generation import generate_regular_rect_mesh
        mesh = generate_regular_rect_mesh(
            a=(-0.5,)*dim, b=(0.5,)*dim,
            nelements_per_axis=(16,)*dim)

        logger.info("%d elements", mesh.nelements)

        part_per_element = get_partition_by_pymetis(mesh, num_parts)

        local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element, num_parts)

        del mesh
    else:
        local_mesh = mesh_dist.receive_mesh_part()

    dcoll = DiscretizationCollection(actx, local_mesh, order=order,
                                     mpi_communicator=comm)

    def source_f(actx, dcoll, t=0):
        source_center = np.array([0.1, 0.22, 0.33])[:dcoll.dim]
        source_width = 0.05
        source_omega = 3
        nodes = thaw(dcoll.nodes(), actx)
        source_center_dist = flat_obj_array(
            [nodes[i] - source_center[i] for i in range(dcoll.dim)]
        )
        return (
            np.sin(source_omega*t)
            * actx.np.exp(
                -np.dot(source_center_dist, source_center_dist)
                / source_width**2
            )
        )

    from grudge.models.wave import WeakWaveOperator
    from meshmode.mesh import BTAG_ALL, BTAG_NONE

    wave_op = WeakWaveOperator(
        dcoll,
        0.1,
        source_f=source_f,
        dirichlet_tag=BTAG_NONE,
        neumann_tag=BTAG_NONE,
        radiation_tag=BTAG_ALL,
        flux_type="upwind"
    )

    fields = flat_obj_array(
        dcoll.zeros(actx),
        [dcoll.zeros(actx) for i in range(dcoll.dim)]
    )

    dt = actx.to_numpy(
        2/3 * wave_op.estimate_rk4_timestep(actx, dcoll, fields=fields))

    wave_op.check_bc_coverage(local_mesh)

    def rhs(t, w):
        return wave_op.operator(t, w)

    dt_stepper = set_up_rk4("w", dt, fields, rhs)

    final_t = 10
    nsteps = int(final_t/dt) + 1

    if comm.rank == 0:
        logger.info("dt=%g nsteps=%d", dt, nsteps)

    from grudge.shortcuts import make_visualizer
    vis = make_visualizer(dcoll)

    step = 0

    def norm(u):
        return op.norm(dcoll, u, 2)

    from time import time
    t_last_step = time()

    if visualize:
        u = fields[0]
        v = fields[1:]
        vis.write_parallel_vtk_file(
            comm,
            f"fld-wave-min-mpi-{{rank:03d}}-{step:04d}.vtu",
            [
                ("u", u),
                ("v", v),
            ]
        )

    for event in dt_stepper.run(t_end=final_t):
        if isinstance(event, dt_stepper.StateComputed):
            assert event.component_id == "w"

            step += 1
            l2norm = norm(u=event.state_component[0])

            if step % 10 == 0:
                if comm.rank == 0:
                    logger.info(f"step: {step} "
                                f"t: {time()-t_last_step} "
                                f"L2: {l2norm}")
                if visualize:
                    vis.write_parallel_vtk_file(
                        comm,
                        f"fld-wave-min-mpi-{{rank:03d}}-{step:04d}.vtu",
                        [
                            ("u", event.state_component[0]),
                            ("v", event.state_component[1:]),
                        ]
                    )
            t_last_step = time()

            # NOTE: These are here to ensure the solution is bounded for the
            # time interval specified
            assert l2norm < 1
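# A numpy-only sketch (hypothetical, independent of grudge/meshmode) of the
# source term built in source_f() above: a Gaussian pulse of width
# source_width centered at source_center, modulated in time by
# sin(source_omega * t).
import numpy as np


def source_profile(x, t, source_center, source_width=0.05, source_omega=3):
    """Evaluate sin(omega*t) * exp(-|x - center|^2 / width^2).

    x has shape (dim, npoints); source_center has shape (dim,).
    """
    dist = x - source_center[:, np.newaxis]
    r2 = np.sum(dist*dist, axis=0)
    return np.sin(source_omega*t) * np.exp(-r2/source_width**2)


# Example: evaluate at a few 2D points at t = 0.1; the pulse peaks at the
# source center and decays over a length scale of source_width.
points = np.array([[0.1, 0.2, 0.4],
                   [0.22, 0.3, 0.1]])
values = source_profile(points, t=0.1, source_center=np.array([0.1, 0.22]))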
def main(snapshot_pattern="wave-mpi-{step:04d}-{rank:04d}.pkl", restart_step=None, use_profiling=False, use_logmgr=False, actx_class=PyOpenCLArrayContext): """Drive the example.""" cl_ctx = cl.create_some_context() queue = cl.CommandQueue(cl_ctx) from mpi4py import MPI comm = MPI.COMM_WORLD rank = comm.Get_rank() num_parts = comm.Get_size() logmgr = initialize_logmgr(use_logmgr, filename="wave-mpi.sqlite", mode="wu", mpi_comm=comm) if use_profiling: queue = cl.CommandQueue( cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) actx = actx_class(queue, allocator=cl_tools.MemoryPool( cl_tools.ImmediateAllocator(queue)), logmgr=logmgr) else: queue = cl.CommandQueue(cl_ctx) actx = actx_class(queue, allocator=cl_tools.MemoryPool( cl_tools.ImmediateAllocator(queue))) if restart_step is None: from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis mesh_dist = MPIMeshDistributor(comm) dim = 2 nel_1d = 16 if mesh_dist.is_mananger_rank(): from meshmode.mesh.generation import generate_regular_rect_mesh mesh = generate_regular_rect_mesh(a=(-0.5, ) * dim, b=(0.5, ) * dim, nelements_per_axis=(nel_1d, ) * dim) print("%d elements" % mesh.nelements) part_per_element = get_partition_by_pymetis(mesh, num_parts) local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element, num_parts) del mesh else: local_mesh = mesh_dist.receive_mesh_part() fields = None else: from mirgecom.restart import read_restart_data restart_data = read_restart_data( actx, snapshot_pattern.format(step=restart_step, rank=rank)) local_mesh = restart_data["local_mesh"] nel_1d = restart_data["nel_1d"] assert comm.Get_size() == restart_data["num_parts"] order = 3 discr = EagerDGDiscretization(actx, local_mesh, order=order, mpi_communicator=comm) current_cfl = 0.485 wave_speed = 1.0 from grudge.dt_utils import characteristic_lengthscales nodal_dt = characteristic_lengthscales(actx, discr) / wave_speed from grudge.op import nodal_min dt = actx.to_numpy(current_cfl * nodal_min(discr, "vol", nodal_dt))[()] t_final = 1 if restart_step is None: t = 0 istep = 0 fields = flat_obj_array(bump(actx, discr), [discr.zeros(actx) for i in range(discr.dim)]) else: t = restart_data["t"] istep = restart_step assert istep == restart_step restart_fields = restart_data["fields"] old_order = restart_data["order"] if old_order != order: old_discr = EagerDGDiscretization(actx, local_mesh, order=old_order, mpi_communicator=comm) from meshmode.discretization.connection import make_same_mesh_connection connection = make_same_mesh_connection( actx, discr.discr_from_dd("vol"), old_discr.discr_from_dd("vol")) fields = connection(restart_fields) else: fields = restart_fields if logmgr: logmgr_add_cl_device_info(logmgr, queue) logmgr_add_device_memory_usage(logmgr, queue) logmgr.add_watches(["step.max", "t_step.max", "t_log.max"]) try: logmgr.add_watches( ["memory_usage_python.max", "memory_usage_gpu.max"]) except KeyError: pass if use_profiling: logmgr.add_watches(["multiply_time.max"]) vis_timer = IntervalTimer("t_vis", "Time spent visualizing") logmgr.add_quantity(vis_timer) vis = make_visualizer(discr) def rhs(t, w): return wave_operator(discr, c=wave_speed, w=w) compiled_rhs = actx.compile(rhs) while t < t_final: if logmgr: logmgr.tick_before() # restart must happen at beginning of step if istep % 100 == 0 and ( # Do not overwrite the restart file that we just read. 
istep != restart_step): from mirgecom.restart import write_restart_file write_restart_file(actx, restart_data={ "local_mesh": local_mesh, "order": order, "fields": fields, "t": t, "step": istep, "nel_1d": nel_1d, "num_parts": num_parts }, filename=snapshot_pattern.format(step=istep, rank=rank), comm=comm) if istep % 10 == 0: print(istep, t, actx.to_numpy(discr.norm(fields[0]))) vis.write_parallel_vtk_file(comm, "fld-wave-mpi-%03d-%04d.vtu" % (rank, istep), [ ("u", fields[0]), ("v", fields[1:]), ], overwrite=True) fields = thaw(freeze(fields, actx), actx) fields = rk4_step(fields, t, dt, compiled_rhs) t += dt istep += 1 if logmgr: set_dt(logmgr, dt) logmgr.tick_after() final_soln = actx.to_numpy(discr.norm(fields[0])) assert np.abs(final_soln - 0.04409852463947439) < 1e-14
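# Hypothetical driver sketch for main() above: start a fresh run, or resume
# from one of the checkpoints the loop writes every 100 steps using the
# default snapshot_pattern "wave-mpi-{step:04d}-{rank:04d}.pkl".
if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        # e.g. `mpiexec -n 2 python wave_mpi.py 100` resumes from step 100
        main(restart_step=int(sys.argv[1]), use_logmgr=True)
    else:
        main(use_logmgr=True)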
def _test_mpi_boundary_swap(dim, order, num_groups):
    from meshmode.distributed import MPIMeshDistributor, MPIBoundaryCommSetupHelper

    from mpi4py import MPI
    mpi_comm = MPI.COMM_WORLD
    i_local_part = mpi_comm.Get_rank()
    num_parts = mpi_comm.Get_size()

    mesh_dist = MPIMeshDistributor(mpi_comm)

    if mesh_dist.is_mananger_rank():
        np.random.seed(42)
        from meshmode.mesh.generation import generate_warped_rect_mesh
        meshes = [generate_warped_rect_mesh(dim, order=order, n=4)
                  for _ in range(num_groups)]

        if num_groups > 1:
            from meshmode.mesh.processing import merge_disjoint_meshes
            mesh = merge_disjoint_meshes(meshes)
        else:
            mesh = meshes[0]

        part_per_element = np.random.randint(num_parts, size=mesh.nelements)

        local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element, num_parts)
    else:
        local_mesh = mesh_dist.receive_mesh_part()

    group_factory = PolynomialWarpAndBlendGroupFactory(order)

    from meshmode.array_context import PyOpenCLArrayContext
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    actx = PyOpenCLArrayContext(queue)

    from meshmode.discretization import Discretization
    vol_discr = Discretization(actx, local_mesh, group_factory)

    from meshmode.distributed import get_connected_partitions
    connected_parts = get_connected_partitions(local_mesh)
    assert i_local_part not in connected_parts

    bdry_setup_helpers = {}
    local_bdry_conns = {}

    from meshmode.discretization.connection import make_face_restriction
    from meshmode.mesh import BTAG_PARTITION
    for i_remote_part in connected_parts:
        local_bdry_conns[i_remote_part] = make_face_restriction(
            actx, vol_discr, group_factory, BTAG_PARTITION(i_remote_part))

        setup_helper = bdry_setup_helpers[i_remote_part] = \
            MPIBoundaryCommSetupHelper(
                mpi_comm, actx, local_bdry_conns[i_remote_part],
                i_remote_part, bdry_grp_factory=group_factory)

        setup_helper.post_sends()

    remote_to_local_bdry_conns = {}
    from meshmode.discretization.connection import check_connection
    while bdry_setup_helpers:
        for i_remote_part, setup_helper in bdry_setup_helpers.items():
            if setup_helper.is_setup_ready():
                assert bdry_setup_helpers.pop(i_remote_part) is setup_helper
                conn = setup_helper.complete_setup()
                check_connection(actx, conn)
                remote_to_local_bdry_conns[i_remote_part] = conn
                break

        # FIXME: Not ideal, busy-waits

    _test_data_transfer(mpi_comm,
                        actx,
                        local_bdry_conns,
                        remote_to_local_bdry_conns,
                        connected_parts)

    logger.debug("Rank %d exiting", i_local_part)
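# Hypothetical standalone launcher for the boundary-swap test above; the test
# suite itself may drive this differently (e.g. through pytest). Run it under
# MPI with at least two ranks, e.g. `mpiexec -n 3 python test_boundary_swap.py`.
if __name__ == "__main__":
    _test_mpi_boundary_swap(dim=2, order=3, num_groups=2)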