def peek(target, key):
    """Export the sites selected by key from target as a gpt.mview.

    target may be a single gpt.lattice (key indexes it directly) or a
    list of lattices (key is mapped through map_key and the flattened
    v_obj handles are exported through cgpt).  Any other type aborts.
    """
    kind = type(target)
    if kind == gpt.lattice:
        return gpt.mview(target[key])
    if kind == list:
        pos, tidx, shape = map_key(target, key)
        # flatten the per-lattice v_obj handles into one list for cgpt
        flat_v_obj = [obj for lat in target for obj in lat.v_obj]
        return gpt.mview(cgpt.lattice_export(flat_v_obj, pos, tidx, shape))
    assert 0
def write_lattice(self, ctx, l):
    """Write lattice l under tag ctx into this container's per-rank files.

    Returns a description string (grid + cartesian view + lattice
    descriptors followed by the per-rank file offsets) that allows the
    record to be located again on read.
    """
    g = l.grid
    # NUL-terminated tag identifying this record in the file
    tag = (ctx + "\0").encode("utf-8")
    ntag = len(tag)
    nd = len(g.fdimensions)

    # create cartesian view for writing; an explicit "mpi" parameter
    # overrides the grid's own processor layout
    if "mpi" in self.params:
        mpi = self.params["mpi"]
    else:
        mpi = g.mpi
    cv0 = gpt.cartesian_view(-1, mpi, g.fdimensions, g.cb, l.checkerboard())

    # file positions, one slot per view rank; filled by the rank that
    # writes the view, then globally summed so every rank knows them all
    pos = numpy.array([0] * cv0.ranks, dtype=numpy.uint64)

    # describe
    res = g.describe() + " " + cv0.describe() + " " + l.describe()

    # find tasks for my node
    views_for_node = self.views_for_node(cv0, g)

    # performance counters (distribution / checksum / file write)
    dt_distr, dt_crc, dt_write = 0.0, 0.0, 0.0
    #g.barrier()
    t0 = gpt.time()
    szGB = 0.0

    # need to write all views
    for xk, iview in enumerate(views_for_node):
        f, p = self.open_view(xk, iview, True, mpi, g.fdimensions, g.cb, l.checkerboard())

        # all nodes are needed to communicate, so l[p] runs even on
        # ranks that do not own a file for this view
        dt_distr -= gpt.time()
        mv = gpt.mview(l[p])
        dt_distr += gpt.time()

        # write data (only ranks holding a file handle)
        if not f is None:
            # description and data
            dt_crc -= gpt.time()
            crc = gpt.crc32(mv)
            dt_crc += gpt.time()
            dt_write -= gpt.time()
            # remember where this record starts in the file
            pos[iview] = f.tell()
            # record header: tag length, tag, crc32, dimensionality,
            # global dimensions, mpi layout, payload size -- all
            # little-endian fixed-width integers
            f.write(ntag.to_bytes(4, byteorder='little'))
            f.write(tag)
            f.write(crc.to_bytes(4, byteorder='little'))
            f.write(nd.to_bytes(4, byteorder='little'))
            for i in range(nd):
                f.write(g.gdimensions[i].to_bytes(4, byteorder='little'))
            for i in range(nd):
                f.write(g.mpi[i].to_bytes(4, byteorder='little'))
            f.write(len(mv).to_bytes(8, byteorder='little'))
            f.write(mv)
            f.flush()
            dt_write += gpt.time()
            szGB += len(mv) / 1024.**3.

    t1 = gpt.time()

    szGB = g.globalsum(szGB)
    # dt_crc != 0.0 means this rank actually wrote something
    if self.verbose and dt_crc != 0.0:
        gpt.message(
            "Wrote %g GB at %g GB/s (%g GB/s for distribution, %g GB/s for checksum, %g GB/s for writing, %d views per node)"
            % (szGB, szGB / (t1 - t0), szGB / dt_distr, szGB / dt_crc, szGB / dt_write, len(views_for_node)))
    # share the per-view file offsets with all ranks
    g.globalsum(pos)

    return res + " " + " ".join(["%d" % x for x in pos])
# grid L = [16, 16, 16, 32] grid_dp = g.grid(L, g.double) grid_sp = g.grid(L, g.single) # test fields l_dp = g.random("test").cnormal(g.vcolor(grid_dp)) l_sp = g.convert(l_dp, g.single) ################################################################################ # Test mview ################################################################################ c = g.coordinates(l_dp) x = l_dp[c] mv = g.mview(x) assert mv.itemsize == 1 and mv.shape[0] == len(mv) assert sys.getrefcount(x) == 3 del mv assert sys.getrefcount(x) == 2 ################################################################################ # Test assignments ################################################################################ pos = l_dp.mview_coordinates() lhs = g.lattice(l_dp) def assign_copy(): g.copy(lhs, l_dp)
def save(filename, objs, params):
    """Save (basis, cevec, ev) compressed eigenvector data to directory filename.

    objs must be [basis, cevec, ev]: fine-grid basis vectors, coarse-grid
    eigenvectors and eigenvalues.  The first nsingle basis vectors are
    stored in fp32, the remainder in fp16; coarse vectors are stored in a
    mixed fp32/fp16 layout.  Per-rank binary files are written under
    filename together with eigen-values.txt and metadata.txt (which also
    receives the per-rank crc32 checksums).
    """

    # split data to save
    assert len(objs) == 3
    basis = objs[0]
    cevec = objs[1]
    ev = objs[2]

    # verbosity
    verbose = gpt.default.is_verbose("io")
    if verbose:
        gpt.message(
            "Saving %d basis vectors, %d coarse-grid vectors, %d eigenvalues to %s"
            % (len(basis), len(cevec), len(ev), filename))

    # create directory
    if gpt.rank() == 0:
        os.makedirs(filename, exist_ok=True)

    # now sync since only root has created directory
    gpt.barrier()

    # write eigenvalues (root rank only, plain text, one per line)
    if gpt.rank() == 0:
        f = open("%s/eigen-values.txt" % filename, "wt")
        f.write("%d\n" % len(ev))
        for v in ev:
            f.write("%.15E\n" % v)
        f.close()

    # site checkerboard
    # only odd is used in this file format but
    # would be easy to generalize here
    site_cb = gpt.odd

    # grids
    assert len(basis) > 0
    assert len(cevec) > 0
    fgrid = basis[0].grid
    cgrid = cevec[0].grid

    # mpi layout; an explicit "mpi" parameter overrides the fine grid's
    if "mpi" in params:
        mpi = params["mpi"]
    else:
        mpi = fgrid.mpi
    assert mpi[0] == 1  # assert no mpi in 5th direction

    # params
    assert basis[0].checkerboard() == site_cb
    nd = 5
    assert len(fgrid.ldimensions) == nd
    fdimensions = fgrid.fdimensions
    # local dimensions per rank; conformDiv asserts divisibility
    ldimensions = [conformDiv(fdimensions[i], mpi[i]) for i in range(nd)]
    assert fgrid.precision == gpt.single
    s = ldimensions
    # b: fine-to-coarse block size per dimension, nb: blocks per dimension
    b = [
        conformDiv(fgrid.fdimensions[i], cgrid.fdimensions[i])
        for i in range(nd)
    ]
    nb = [conformDiv(s[i], b[i]) for i in range(nd)]
    neigen = len(cevec)
    nbasis = len(basis)

    # nsingle: how many basis vectors keep full fp32 precision
    if "nsingle" in params:
        nsingle = params["nsingle"]
        assert nsingle <= nbasis
    else:
        nsingle = nbasis

    nsingleCap = min([nsingle, nbasis])
    blocks = numpy.prod(nb)
    FP16_COEF_EXP_SHARE_FLOATS = 10

    # write metadata (root rank only; dimensions are rotated by one so
    # they are emitted in the file format's expected axis order)
    if gpt.rank() == 0:
        fmeta = open("%s/metadata.txt" % filename, "wt")
        for i in range(nd):
            fmeta.write("s[%d] = %d\n" % (i, s[(i + 1) % nd]))
        for i in range(nd):
            fmeta.write("b[%d] = %d\n" % (i, b[(i + 1) % nd]))
        for i in range(nd):
            fmeta.write("nb[%d] = %d\n" % (i, nb[(i + 1) % nd]))
        fmeta.write("neig = %d\n" % neigen)
        fmeta.write("nkeep = %d\n" % nbasis)
        fmeta.write("nkeep_single = %d\n" % nsingle)
        fmeta.write("blocks = %d\n" % blocks)
        fmeta.write("FP16_COEF_EXP_SHARE_FLOATS = %d\n" % FP16_COEF_EXP_SHARE_FLOATS)
        fmeta.flush()
        # write crc32 later

    # create cartesian view on fine grid
    cv0 = gpt.cartesian_view(-1, mpi, fdimensions, fgrid.cb, site_cb)
    views = cv0.views_for_node(fgrid)

    # per-rank crc32 accumulators, globally summed at the end
    crc32 = numpy.array([0] * cv0.ranks, dtype=numpy.uint64)

    # timing; counters start at a tiny epsilon to avoid division by zero
    t0 = gpt.time()
    totalSizeGB = 0
    dt_fp16 = 1e-30
    dt_distr = 1e-30
    dt_munge = 1e-30
    dt_crc = 1e-30
    dt_fwrite = 1e-30
    # NOTE(review): t0 is assigned twice in a row here -- the second
    # assignment makes the first redundant; confirm before cleaning up
    t0 = gpt.time()

    # load all views
    if verbose:
        gpt.message("Saving %s with %d views per node" % (filename, len(views)))

    for i, v in enumerate(views):
        # v is None on ranks with no view this round; they still
        # participate in the collective calls below
        cv = gpt.cartesian_view(v if v is not None else -1, mpi, fdimensions,
                                fgrid.cb, site_cb)
        cvc = gpt.cartesian_view(v if v is not None else -1, mpi,
                                 cgrid.fdimensions, gpt.full, gpt.none)
        pos_coarse = gpt.coordinates(cvc, "canonical")

        dn, fn = get_local_name(filename, cv)
        if fn is not None:
            os.makedirs(dn, exist_ok=True)

        # sizes (bytes per block for fp32 and fp16 payloads)
        slot_lsites = numpy.prod(cv.view_dimensions)
        assert slot_lsites % blocks == 0
        block_data_size_single = slot_lsites * 12 // 2 // blocks * 2 * 4
        block_data_size_fp16 = FP_16_SIZE(slot_lsites * 12 // 2 // blocks * 2, 24)
        coarse_block_size_part_fp32 = 2 * (4 * nsingleCap)
        coarse_block_size_part_fp16 = 2 * (FP_16_SIZE(
            nbasis - nsingleCap, FP16_COEF_EXP_SHARE_FLOATS))
        coarse_vector_size = (coarse_block_size_part_fp32 +
                              coarse_block_size_part_fp16) * blocks
        totalSize = (
            blocks * (block_data_size_single * nsingleCap +
                      block_data_size_fp16 * (nbasis - nsingleCap)) +
            neigen * coarse_vector_size)
        # NOTE(review): totalSizeGB is also incremented by the measured
        # globalWriteGB in the write loops below -- this predicted size
        # looks like it may be double counted in the reports; verify
        totalSizeGB += totalSize / 1024.0**3.0 if v is not None else 0.0

        # checksum
        crc32_comp = 0

        # file
        f = gpt.FILE(fn, "wb") if fn is not None else None

        # block positions
        pos = [
            cgpt.coordinates_from_block(cv.top, cv.bottom, b, nb, "canonicalOdd")
            for b in range(blocks)
        ]

        # group blocks: halve the number of reads (doubling the data per
        # read) until at most max_read_blocks remain
        read_blocks = blocks
        block_reduce = 1
        max_read_blocks = get_param(params, "max_read_blocks", 8)
        while read_blocks > max_read_blocks and read_blocks % 2 == 0:
            pos = [
                numpy.concatenate((pos[2 * i + 0], pos[2 * i + 1]))
                for i in range(read_blocks // 2)
            ]
            block_data_size_single *= 2
            block_data_size_fp16 *= 2
            read_blocks //= 2
            block_reduce *= 2

        # make read-only to enable caching
        for x in pos:
            x.setflags(write=0)

        # single-precision data
        data = memoryview(bytearray(block_data_size_single * nsingleCap))

        reduced_size = len(data) // block_reduce

        for b in range(read_blocks):
            fgrid.barrier()
            dt_distr -= gpt.time()
            data_munged = gpt.peek(
                basis[0:nsingleCap], pos[b]
            )  # TODO: can already munge here using new index interface
            dt_distr += gpt.time()

            if f is not None:
                # reorder inner/outer indices into file layout, one
                # reduced chunk at a time
                dt_munge -= gpt.time()
                for l in range(block_reduce):
                    cgpt.munge_inner_outer(
                        data[reduced_size * l:reduced_size * (l + 1)],
                        data_munged[reduced_size * l:reduced_size * (l + 1)],
                        nsingleCap,
                        len(pos[b]) // block_reduce,
                    )
                dt_munge += gpt.time()
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()

            fgrid.barrier()
            dt_fwrite -= gpt.time()
            if f is not None:
                f.write(data)
                globalWriteGB = len(data) / 1024.0**3.0
            else:
                globalWriteGB = 0.0
            globalWriteGB = fgrid.globalsum(globalWriteGB)
            dt_fwrite += gpt.time()
            totalSizeGB += globalWriteGB

            if verbose:
                gpt.message(
                    "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fwrite,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                    ))

        # fp16 data: remaining basis vectors are converted fp32 -> fp16
        if nbasis != nsingleCap:
            # allocate data buffer
            data_fp32 = memoryview(
                bytearray(block_data_size_single * (nbasis - nsingleCap)))
            data = memoryview(
                bytearray(block_data_size_fp16 * (nbasis - nsingleCap)))
            reduced_size = len(data_fp32) // block_reduce
            for b in range(read_blocks):
                fgrid.barrier()
                dt_distr -= gpt.time()
                data_munged = gpt.peek(basis[nsingleCap:nbasis], pos[b])
                dt_distr += gpt.time()

                if f is not None:
                    dt_munge -= gpt.time()
                    for l in range(block_reduce):
                        cgpt.munge_inner_outer(
                            data_fp32[reduced_size * l:reduced_size * (l + 1)],
                            data_munged[reduced_size * l:reduced_size *
                                        (l + 1)],
                            nsingleCap,
                            len(pos[b]) // block_reduce,
                        )
                    dt_munge += gpt.time()
                    dt_fp16 -= gpt.time()
                    cgpt.fp32_to_fp16(data, data_fp32, 24)
                    dt_fp16 += gpt.time()
                    dt_crc -= gpt.time()
                    crc32_comp = gpt.crc32(data, crc32_comp)
                    dt_crc += gpt.time()

                fgrid.barrier()
                dt_fwrite -= gpt.time()
                if f is not None:
                    f.write(data)
                    globalWriteGB = len(data) / 1024.0**3.0
                else:
                    globalWriteGB = 0.0
                globalWriteGB = fgrid.globalsum(globalWriteGB)
                dt_fwrite += gpt.time()
                totalSizeGB += globalWriteGB

                if verbose:
                    gpt.message(
                        "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s"
                        % (
                            totalSizeGB,
                            totalSizeGB / dt_fwrite,
                            totalSizeGB / dt_crc,
                            totalSizeGB / dt_munge,
                            totalSizeGB / dt_distr,
                            totalSizeGB / dt_fp16,
                        ))

        # coarse grid data: each coarse eigenvector stored in the mixed
        # fp32/fp16 layout
        data = memoryview(bytearray(coarse_vector_size))
        for j in range(neigen):
            fgrid.barrier()
            dt_distr -= gpt.time()
            data_fp32 = gpt.mview(cevec[j][pos_coarse])
            dt_distr += gpt.time()

            if f is not None:
                dt_fp16 -= gpt.time()
                cgpt.fp32_to_mixed_fp32fp16(
                    data,
                    data_fp32,
                    coarse_block_size_part_fp32,
                    coarse_block_size_part_fp16,
                    FP16_COEF_EXP_SHARE_FLOATS,
                )
                dt_fp16 += gpt.time()
                dt_crc -= gpt.time()
                crc32_comp = gpt.crc32(data, crc32_comp)
                dt_crc += gpt.time()

            fgrid.barrier()
            dt_fwrite -= gpt.time()
            if f is not None:
                f.write(data)
                globalWriteGB = len(data) / 1024.0**3.0
            else:
                globalWriteGB = 0.0
            globalWriteGB = fgrid.globalsum(globalWriteGB)
            dt_fwrite += gpt.time()
            totalSizeGB += globalWriteGB

            # report roughly every 10% of the coarse vectors
            if verbose and j % (neigen // 10) == 0:
                gpt.message(
                    "* write %g GB: fwrite at %g GB/s, crc32 at %g GB/s, munge at %g GB/s, distribute at %g GB/s, fp16 at %g GB/s"
                    % (
                        totalSizeGB,
                        totalSizeGB / dt_fwrite,
                        totalSizeGB / dt_crc,
                        totalSizeGB / dt_munge,
                        totalSizeGB / dt_distr,
                        totalSizeGB / dt_fp16,
                    ))

        # save crc
        crc32[cv.rank] = crc32_comp

    # synchronize crc32
    fgrid.globalsum(crc32)

    # timing
    t1 = gpt.time()

    # write crc to metadata
    if gpt.rank() == 0:
        for i in range(len(crc32)):
            fmeta.write("crc32[%d] = %X\n" % (i, crc32[i]))
        fmeta.close()

    # verbosity
    if verbose:
        gpt.message("* save %g GB at %g GB/s" %
                    (totalSizeGB, totalSizeGB / (t1 - t0)))
for y in range(4): for z in range(8): for t in range(8): src[x, y, z, t] = g.vcomplex([x + t * 1j, y + t * 1j, z + t * 1j] * 10, 30) # now create a random partition of this lattice distributed over all nodes c = (g.coordinates(grid).copy().view(np.ndarray) ) # copy to make it writeable and lift local_coordinate type random.seed(13) for tr in range(10): shift = [random.randint(0, 8) for i in range(4)] for i in range(len(c)): for j in range(4): c[i][j] = (c[i][j] + shift[j]) % grid.gdimensions[j] data = src[c] # test global uniform memory system mvrestore = g.mview(data) err2 = 0.0 for i, pos in enumerate(c): for n in range(10): err2 += ((data[i][3 * n + 0].real - pos[0])**2.0 + (data[i][3 * n + 1].real - pos[1])**2.0 + (data[i][3 * n + 2].real - pos[2])**2.0) dst[c] = mvrestore err2 = grid.globalsum(err2) err1 = g.norm2(src - dst) g.message("Test shift", tr, "/ 10 :", shift, "difference norm/e2:", err1, err2) assert err1 == 0.0 and err2 == 0.0