def test_simple_tiling(self, ker_init, ker_reduce_ind_read, ker_write,
                       ker_write2d, iterset, indset, iterset2indset,
                       ix2, x, y, z, skip_greedy, nu, ts):
    """Check that tiling produces the correct output in a sequence of four
    loops: two direct writes over ``iterset`` (to ``y`` and ``z``), one direct
    write over ``indset`` (to ``ix2``), and a final loop over ``iterset`` that
    increments ``y`` while reading ``ix2`` through ``iterset2indset`` and
    ``z`` directly.

    The tile size (ts) and unroll factor (nu) are forwarded to the loop chain
    as ``tile_size`` and ``num_unroll``, so that different values can be tried
    to check the correctness of different fusion strategies."""

    chain_name = "simple_nu%d" % nu
    tiling_opts = {'mode': 'tile', 'tile_size': ts, 'num_unroll': nu}

    def time_loop_body():
        # The order of these four loops defines the chain being tiled.
        op2.par_loop(op2.Kernel(ker_init, "ker_init"),
                     iterset, y(op2.WRITE))
        op2.par_loop(op2.Kernel(ker_write, "ker_write"),
                     iterset, z(op2.WRITE))
        op2.par_loop(op2.Kernel(ker_write2d, "ker_write2d"),
                     indset, ix2(op2.WRITE))
        op2.par_loop(op2.Kernel(ker_reduce_ind_read, "ker_reduce_ind_read"),
                     iterset,
                     y(op2.INC),
                     ix2(op2.READ, iterset2indset),
                     z(op2.READ))

    def run_chain(repeats):
        # Execute the whole loop sequence `repeats` times, each time
        # within its own loop chain context.
        for _ in range(repeats):
            with loop_chain(chain_name, **tiling_opts):
                time_loop_body()

    # Tiling is skipped until the same sequence is seen three times
    run_chain(2)
    assert sum(y.data) == nelems * 3

    run_chain(4)
    assert sum(y.data) == nelems * 3
def test_acyclic_raw_dependency(self, ker_ind_inc, ker_write, iterset,
                                bigiterset, indset, iterset2indset,
                                indset2iterset, bigiterset2iterset, x, y,
                                bigx, ix, sl, skip_greedy):
    """Check that tiling produces the correct output in a sequence of loops
    characterized by read-after-write dependencies.

    The chain is created with ``ignore_war=True``, i.e. write-after-read
    dependencies are to be ignored; the assertions show that the result
    computed by the chained loops is correct anyway."""

    chain_opts = {
        'mode': 'tile',
        'tile_size': nelems//10,
        'num_unroll': 1,
        'seed_loop': sl,
        'ignore_war': True,
    }

    # Tiling is skipped until the same sequence is seen three times
    for _ in range(3):
        # (Re)write all four dats before each execution of the chain.
        op2.par_loop(op2.Kernel(ker_write, "ker_write"),
                     iterset, x(op2.WRITE))
        op2.par_loop(op2.Kernel(ker_write, "ker_write"),
                     iterset, y(op2.WRITE))
        op2.par_loop(op2.Kernel(ker_write, "ker_write"),
                     bigiterset, bigx(op2.WRITE))
        op2.par_loop(op2.Kernel(ker_write, "ker_write"),
                     indset, ix(op2.WRITE))

        with loop_chain("tiling_acyclic_raw", **chain_opts):
            # Each loop reads the dat incremented by the previous one:
            # bigx -> x -> ix -> y.
            op2.par_loop(op2.Kernel(ker_ind_inc, 'ker_ind_inc'),
                         bigiterset,
                         x(op2.INC, bigiterset2iterset),
                         bigx(op2.READ))
            op2.par_loop(op2.Kernel(ker_ind_inc, 'ker_ind_inc'),
                         iterset,
                         ix(op2.INC, iterset2indset),
                         x(op2.READ))
            op2.par_loop(op2.Kernel(ker_ind_inc, 'ker_ind_inc'),
                         indset,
                         y(op2.INC, indset2iterset),
                         ix(op2.READ))

        assert sum(x.data) == nelems * 3
        assert sum(ix.data) == nelems * 4
        assert sum(y.data) == nelems * 5
def test_advanced_tiling(self, ker_init, ker_reduce_ind_read, ker_ind_reduce,
                         ker_write, ker_write2d, ker_inc, iterset, indset,
                         iterset2indset, indset2iterset, ix2, y, z,
                         skip_greedy, nu, ts, fs, sl):
    """Check that tiling produces the correct output in a sequence of six
    loops. The loops perform direct writes, direct increments, and reads
    through maps; both RAW and WAR dependencies are present.

    The tile size (ts), unroll factor (nu), fusion scheme (fs) and seed loop
    (sl) are forwarded to the loop chain, so that different optimization
    strategies can be checked for correctness."""

    chain_name = "advanced_nu%d" % nu
    chain_opts = {
        'mode': 'tile',
        'tile_size': ts,
        'num_unroll': nu,
        'explicit_mode': fs,
        'seed_loop': sl,
    }

    # Tiling is skipped until the same sequence is seen three times
    for _ in range(4):
        with loop_chain(chain_name, **chain_opts):
            # Three direct writes: y and z over iterset, ix2 over indset.
            op2.par_loop(op2.Kernel(ker_init, "ker_init"),
                         iterset, y(op2.WRITE))
            op2.par_loop(op2.Kernel(ker_write, "ker_write"),
                         iterset, z(op2.WRITE))
            op2.par_loop(op2.Kernel(ker_write2d, "ker_write2d"),
                         indset, ix2(op2.WRITE))
            # y += f(ix2, z); ix2 += g(y); z += f(ix2, y).
            op2.par_loop(op2.Kernel(ker_reduce_ind_read, "ker_reduce_ind_read"),
                         iterset,
                         y(op2.INC),
                         ix2(op2.READ, iterset2indset),
                         z(op2.READ))
            op2.par_loop(op2.Kernel(ker_ind_reduce, "ker_ind_reduce"),
                         indset,
                         ix2(op2.INC),
                         y(op2.READ, indset2iterset))
            op2.par_loop(op2.Kernel(ker_reduce_ind_read, "ker_reduce_ind_read"),
                         iterset,
                         z(op2.INC),
                         ix2(op2.READ, iterset2indset),
                         y(op2.READ))

        assert sum(z.data) == nelems * 27 + nelems
        assert sum(y.data) == nelems * 3
        assert sum(sum(ix2.data)) == nelems * 9
def test_war_dependency(self, ker_ind_reduce, ker_reduce_ind_read, ker_write,
                        ker_write2d, iterset, indset, sl, iterset2indset,
                        indset2iterset, x, y, ix2, skip_greedy):
    """Check that tiling works properly in presence of write-after-read
    dependencies.

    Within the chain, the first loop reads ``x`` (through ``indset2iterset``)
    and the second loop increments it."""

    chain_opts = {
        'mode': 'tile',
        'tile_size': nelems // 10,
        'num_unroll': 1,
        'seed_loop': sl,
    }

    # y is written once, up front; it is only read afterwards.
    op2.par_loop(op2.Kernel(ker_write, "ker_write"), iterset, y(op2.WRITE))

    # Tiling is skipped until the same sequence is seen three times
    for _ in range(3):
        # (Re)write x and ix2 before each execution of the chain.
        op2.par_loop(op2.Kernel(ker_write, "ker_write"),
                     iterset, x(op2.WRITE))
        op2.par_loop(op2.Kernel(ker_write2d, "ker_write2d"),
                     indset, ix2(op2.WRITE))

        with loop_chain("tiling_war", **chain_opts):
            op2.par_loop(op2.Kernel(ker_ind_reduce, "ker_ind_reduce"),
                         indset,
                         ix2(op2.INC),
                         x(op2.READ, indset2iterset))
            op2.par_loop(op2.Kernel(ker_reduce_ind_read, "ker_reduce_ind_read"),
                         iterset,
                         x(op2.INC),
                         ix2(op2.READ, iterset2indset),
                         y(op2.READ))

        assert sum(sum(ix2.data)) == nelems * (1 + 2) + nelems * 2
        assert sum(x.data) == sum(sum(ix2.data)) + nelems
def test_war_dependency(self, ker_ind_reduce, ker_reduce_ind_read, ker_write,
                        ker_write2d, iterset, indset, sl, iterset2indset,
                        indset2iterset, x, y, ix2, skip_greedy):
    """Check that tiling works properly in presence of write-after-read
    dependencies: in the chained pair of loops, ``x`` is first read via
    ``indset2iterset`` and then incremented directly."""

    def prepare_inputs():
        # Direct writes to x (over iterset) and ix2 (over indset).
        op2.par_loop(op2.Kernel(ker_write, "ker_write"),
                     iterset, x(op2.WRITE))
        op2.par_loop(op2.Kernel(ker_write2d, "ker_write2d"),
                     indset, ix2(op2.WRITE))

    def chained_loops():
        # The two loops executed inside the loop chain, in this order.
        op2.par_loop(op2.Kernel(ker_ind_reduce, "ker_ind_reduce"),
                     indset,
                     ix2(op2.INC),
                     x(op2.READ, indset2iterset))
        op2.par_loop(op2.Kernel(ker_reduce_ind_read, "ker_reduce_ind_read"),
                     iterset,
                     x(op2.INC),
                     ix2(op2.READ, iterset2indset),
                     y(op2.READ))

    op2.par_loop(op2.Kernel(ker_write, "ker_write"), iterset, y(op2.WRITE))

    # Tiling is skipped until the same sequence is seen three times
    for _ in range(3):
        prepare_inputs()
        with loop_chain("tiling_war", mode='tile', tile_size=nelems//10,
                        num_unroll=1, seed_loop=sl):
            chained_loops()
        assert sum(sum(ix2.data)) == nelems * (1 + 2) + nelems * 2
        assert sum(x.data) == sum(sum(ix2.data)) + nelems
def run(self, T, TS=0):
    """ Run the elastic wave simulation until t = T or ntimesteps = TS.

    :param float T: The finish time of the simulation.
    :param TS: The maximum number of timesteps performed; ignored if = 0.
    :returns: The tuple ``(start, end, timestep, u1, s1)``: the ``time()``
              readings taken right before the time-stepping loop (i.e.
              after the inverse mass matrices have been set up) and right
              after the final write of the fields, the number of timesteps
              performed, and the final solution fields for velocity and
              stress.
    """

    # Write out the initial condition.
    self.write(self.u1, self.s1, self.tofile)

    info("Generating inverse mass matrix")
    # Pre-assemble the inverse mass matrices, which should stay
    # constant throughout the simulation (assuming no mesh adaptivity).
    start = time()
    self.assemble_inverse_mass()
    end = time()
    info("DONE! (Elapsed: %f s)" % round(end - start, 3))
    op2.MPI.COMM_WORLD.barrier()

    info("Copying inverse mass matrix into a dat...")
    start = time()
    self.copy_massmatrix_into_dat()
    end = time()
    info("DONE! (Elapsed: %f s)" % round(end - start, 3))
    op2.MPI.COMM_WORLD.barrier()

    # From here on, `start` marks the beginning of the time-stepping phase;
    # it is the value returned to the caller.
    start = time()
    t = self.dt
    timestep = 0
    # TS == 0 means "no cap on the number of timesteps". sys.maxsize is used
    # as the sentinel since sys.maxint no longer exists on Python 3.
    ntimesteps = sys.maxsize if TS == 0 else TS

    # The 1e-12 slack absorbs round-off accumulated in `t += self.dt`.
    while t <= T + 1e-12 and timestep < ntimesteps:
        if op2.MPI.COMM_WORLD.rank == 0 and timestep % self.output == 0:
            info("t = %f, (timestep = %d)" % (t, timestep))

        with loop_chain("main1",
                        tile_size=self.tiling_size,
                        num_unroll=self.tiling_uf,
                        mode=self.tiling_mode,
                        extra_halo=self.tiling_halo,
                        explicit=self.tiling_explicit,
                        use_glb_maps=self.tiling_glb_maps,
                        use_prefetch=self.tiling_prefetch,
                        coloring=self.tiling_coloring,
                        ignore_war=True,
                        log=self.tiling_log):
            # In case the source is time-dependent, update the time 't' here.
            if (self.source):
                with timed_region('source term update'):
                    self.source_expression.t = t
                    self.source = self.source_expression

            # Solve for the velocity vector field.
            self.solve(self.rhs_uh1, self.velocity_mass_asdat, self.uh1)
            self.solve(self.rhs_stemp, self.stress_mass_asdat, self.stemp)
            self.solve(self.rhs_uh2, self.velocity_mass_asdat, self.uh2)
            self.solve(self.rhs_u1, self.velocity_mass_asdat, self.u1)

            # Solve for the stress tensor field.
            self.solve(self.rhs_sh1, self.stress_mass_asdat, self.sh1)
            self.solve(self.rhs_utemp, self.velocity_mass_asdat, self.utemp)
            self.solve(self.rhs_sh2, self.stress_mass_asdat, self.sh2)
            self.solve(self.rhs_s1, self.stress_mass_asdat, self.s1)

        # The new solution becomes the old one for the next timestep.
        self.u0.assign(self.u1)
        self.s0.assign(self.s1)

        # Write out the new fields
        self.write(self.u1, self.s1, self.tofile and timestep % self.output == 0)

        # Move onto next timestep
        t += self.dt
        timestep += 1

    # Write out the final state of the fields
    self.write(self.u1, self.s1, self.tofile)

    end = time()

    return start, end, timestep, self.u1, self.s1